#!/usr/bin/env python3
"""
Collect commit data from all branches of a repository, including full diffs.

Saves JSON to /app/hermes_data/git_reports/<repo>_data.json
"""

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict
from datetime import datetime
|
||
# Output directory for the generated per-repository JSON reports.
REPORTS_BASE_DIR = "/app/hermes_data/git_reports"
|
||
|
||
def get_all_commits_with_diff(repo_url, since=None, until=None, max_diff_size=5000, max_commits=500):
    """Clone *repo_url* and collect commit data from all branches.

    Uses ``git log --all`` so commits reachable only from side branches are
    included.  For each commit (up to *max_commits*) the full patch, numstat
    totals and the list of containing branches are gathered.

    Args:
        repo_url: URL or local path accepted by ``git clone``.
        since: Optional datetime lower bound (passed as ``--since``).
        until: Optional datetime upper bound (passed as ``--until``).
        max_diff_size: Maximum characters of diff kept per commit.
        max_commits: Hard cap on the number of commits processed.

    Returns:
        List of dicts with keys: sha (abbreviated to 8 chars), message,
        author, date, diff, files_changed, insertions, deletions, branches.
        Returns [] when the clone or log fails, or no commits are found.
    """
    clone_dir = tempfile.mkdtemp(prefix='git_report_')
    try:
        try:
            subprocess.run(['git', 'clone', repo_url, clone_dir],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"Clone failed: {e.stderr}", file=sys.stderr)
            return []

        # Fetch all remote branches and tags.  Failure is non-fatal: the
        # fresh clone may already contain everything we need.
        try:
            subprocess.run(['git', '-C', clone_dir, 'fetch', '--all'],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"Fetch --all failed: {e.stderr}", file=sys.stderr)

        # One 'sha|author|date|subject' record per commit, across every ref.
        log_args = ['git', '-C', clone_dir, 'log', '--all',
                    '--pretty=format:%H|%an|%ai|%s']
        if since:
            log_args.append(f'--since={since.isoformat()}')
        if until:
            log_args.append(f'--until={until.isoformat()}')

        try:
            result = subprocess.run(log_args, capture_output=True, text=True,
                                    check=True)
            lines = result.stdout.strip().split('\n')
            if not lines or (len(lines) == 1 and not lines[0]):
                print("No commits found.", file=sys.stderr)
                return []
        except subprocess.CalledProcessError as e:
            print(f"git log --all failed: {e.stderr}", file=sys.stderr)
            return []

        commits_dict, commit_order = _parse_log_lines(lines)

        # Annotate each commit (up to the cap) with the branches containing it.
        for sha in commit_order[:max_commits]:
            commits_dict[sha]['branches'] = _branches_containing(clone_dir, sha)

        # Collect diff text and numstat totals, stopping at max_commits.
        commits = []
        for idx, sha in enumerate(commit_order):
            if idx >= max_commits:
                print(f"Reached max commits limit ({max_commits}), stopping.", file=sys.stderr)
                break
            commit = commits_dict[sha]
            commit['diff'] = _commit_diff(clone_dir, sha, max_diff_size)
            insertions, deletions, files_changed = _commit_stats(clone_dir, sha)
            commit['insertions'] = insertions
            commit['deletions'] = deletions
            commit['files_changed'] = files_changed
            # Debug output
            print(f"Commit {commit['sha']}: diff size = {len(commit['diff'])} bytes, branches: {len(commit['branches'])}", file=sys.stderr)
            commits.append(commit)
        return commits
    finally:
        # Fix: was `subprocess.run(['rm', '-rf', clone_dir])`, which is not
        # portable, ignored failures, and leaked the temp clone whenever an
        # exception escaped mid-processing.  rmtree in finally guarantees
        # cleanup on every exit path.
        shutil.rmtree(clone_dir, ignore_errors=True)


def _parse_log_lines(lines):
    """Parse 'sha|author|date|subject' log lines.

    Returns (commits_dict keyed by full sha, commit_order list of full shas).
    Malformed or blank lines are skipped.  The subject is split with
    maxsplit=3 so '|' characters inside the subject are preserved.
    """
    commits_dict = {}
    commit_order = []
    for line in lines:
        if not line.strip():
            continue
        parts = line.split('|', 3)
        if len(parts) < 4:
            continue
        sha, author, date, subject = parts
        commits_dict[sha] = {
            'sha': sha[:8],
            'author': author,
            'date': date,
            'message': subject,
            'diff': '',
            'files_changed': [],
            'insertions': 0,
            'deletions': 0,
            'branches': [],
        }
        commit_order.append(sha)
    return commits_dict, commit_order


def _branches_containing(repo_dir, sha):
    """Return the deduplicated remote + local branch names containing *sha*.

    Returns [] when either `git branch --contains` invocation fails.
    """
    try:
        remote_proc = subprocess.run(
            ['git', '-C', repo_dir, 'branch', '-r', '--contains', sha],
            capture_output=True, text=True, check=True)
        local_proc = subprocess.run(
            ['git', '-C', repo_dir, 'branch', '--contains', sha],
            capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return []
    names = [b.strip() for b in remote_proc.stdout.split('\n') if b.strip()]
    # lstrip('* ') drops the current-branch marker from local branch lines.
    names += [b.strip().lstrip('* ') for b in local_proc.stdout.split('\n') if b.strip()]
    return list(set(names))


def _commit_diff(repo_dir, sha, max_diff_size):
    """Return the patch text for *sha*, truncated to *max_diff_size* chars.

    Passes -m so merge commits show a diff against each parent.  Errors are
    reported inline in the returned string rather than raised.
    """
    try:
        diff_proc = subprocess.run(
            ['git', '-C', repo_dir, 'show', '-m', '--patch', '--unified=3', sha],
            capture_output=True, text=True, check=True)
        diff_output = diff_proc.stdout
        if not diff_output.strip():
            diff_output = "(no diff content – possibly empty commit or merge commit without changes)"
    except subprocess.CalledProcessError as e:
        diff_output = f"(error getting diff: {e.stderr})"
    if len(diff_output) > max_diff_size:
        diff_output = diff_output[:max_diff_size] + "\n... (diff truncated)"
    return diff_output


def _commit_stats(repo_dir, sha):
    """Return (insertions, deletions, files_changed) for *sha* via --numstat.

    Binary files report '-' counts and contribute 0 to the totals.
    Returns (0, 0, []) when the git invocation fails.
    """
    try:
        stat_proc = subprocess.run(
            ['git', '-C', repo_dir, 'show', '--numstat', '--pretty=format:', sha],
            capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return 0, 0, []
    insertions = 0
    deletions = 0
    files_changed = []
    for sl in stat_proc.stdout.strip().split('\n'):
        if not sl.strip():
            continue
        parts_stat = sl.split('\t')
        if len(parts_stat) >= 3:
            add_str, del_str, fname = parts_stat[0], parts_stat[1], parts_stat[2]
            if add_str != '-':
                insertions += int(add_str)
            if del_str != '-':
                deletions += int(del_str)
            files_changed.append(fname)
    return insertions, deletions, files_changed
|
||
|
||
def main():
    """CLI entry point: gather commit data for one repo and dump it as JSON.

    Prints the path of the written data file to stdout; progress goes to
    stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--repo-url', required=True)
    parser.add_argument('--since', help='YYYY-MM-DD')
    parser.add_argument('--until', help='YYYY-MM-DD')
    parser.add_argument('--max-diff-size', type=int, default=5000,
                        help='Max characters of diff per commit (default 5000)')
    parser.add_argument('--max-commits', type=int, default=500,
                        help='Max commits to process (default 500)')
    args = parser.parse_args()

    since_dt = datetime.fromisoformat(args.since) if args.since else None
    until_dt = datetime.fromisoformat(args.until) if args.until else None

    # Derive the report name from the URL's last path component, dropping a
    # trailing ".git" if present.
    name_match = re.search(r'([^/]+?)(?:\.git)?$', args.repo_url.rstrip('/'))
    repo_name = name_match.group(1) if name_match else "unknown"

    print(f"Analyzing {repo_name} (all branches)...", file=sys.stderr)
    commits = get_all_commits_with_diff(
        args.repo_url, since_dt, until_dt, args.max_diff_size, args.max_commits)

    period = {
        "since": since_dt.isoformat() if since_dt else "full_history_start",
        "until": until_dt.isoformat() if until_dt else "full_history_end",
    }
    data = {
        "repo": repo_name,
        "period": period,
        "commits": commits,
        "issues": [],
    }

    os.makedirs(REPORTS_BASE_DIR, exist_ok=True)
    data_file = os.path.join(REPORTS_BASE_DIR, f"{repo_name}_data.json")
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(data_file)
|
||
|
||
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()