Add scripts

This commit is contained in:
Слонова Анна 2026-04-21 18:17:28 +00:00
parent 3e0cf88814
commit 30479e6c37
2 changed files with 205 additions and 0 deletions

202
scripts/generate_report.py Normal file
View file

@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Сбор данных о коммитах из всех веток репозитория с полным diff.
Сохраняет JSON в /app/hermes_data/git_reports/<repo>_data.json
"""
import os
import re
import sys
import json
import argparse
import subprocess
import tempfile
from datetime import datetime
from collections import defaultdict
# Directory that main() creates on demand and writes "<repo>_data.json" into.
REPORTS_BASE_DIR = "/app/hermes_data/git_reports"
# Field separator for `git log` output (ASCII unit separator). Unlike "|",
# it does not realistically occur in author names or commit subjects.
_LOG_FIELD_SEP = '\x1f'


def _run_git(*args):
    """Run ``git <args...>`` and return its standard output as text.

    Output is decoded as UTF-8 with undecodable bytes replaced, so a diff
    that touches a non-UTF-8 file cannot abort the run with a
    UnicodeDecodeError.

    Raises:
        subprocess.CalledProcessError: git exited with a non-zero status.
        OSError: the git executable could not be started.
    """
    proc = subprocess.run(
        ['git', *args],
        check=True, capture_output=True,
        encoding='utf-8', errors='replace',
    )
    return proc.stdout


def _branches_containing(clone_dir, sha):
    """Return the sorted remote and local branch names that contain *sha*."""
    try:
        remote_out = _run_git('-C', clone_dir, 'branch', '-r', '--contains', sha)
        local_out = _run_git('-C', clone_dir, 'branch', '--contains', sha)
    except subprocess.CalledProcessError:
        return []
    remote = [b.strip() for b in remote_out.split('\n') if b.strip()]
    # `git branch` marks the checked-out branch with a leading "* ".
    local = [b.strip().lstrip('* ') for b in local_out.split('\n') if b.strip()]
    # Sorted so the generated report is stable between runs.
    return sorted(set(remote + local))


def _parse_numstat(numstat_output):
    """Parse `git show --numstat` output into (insertions, deletions, files)."""
    insertions = 0
    deletions = 0
    files_changed = []
    for line in numstat_output.split('\n'):
        if not line.strip():
            continue
        fields = line.split('\t', 2)
        if len(fields) < 3:
            continue
        added, removed, fname = fields
        try:
            # git prints "-" instead of a count for binary files.
            added_count = 0 if added == '-' else int(added)
            removed_count = 0 if removed == '-' else int(removed)
        except ValueError:
            # Not a numstat line; skip it rather than abort the whole run.
            continue
        insertions += added_count
        deletions += removed_count
        files_changed.append(fname)
    return insertions, deletions, files_changed


def _collect_commits(clone_dir, repo_url, since, until, max_diff_size, max_commits):
    """Clone *repo_url* into *clone_dir* and build the list of commit records."""
    try:
        # "--" stops option parsing, so a URL starting with "-" cannot be
        # interpreted by git as a command-line option.
        _run_git('clone', '--', repo_url, clone_dir)
    except subprocess.CalledProcessError as e:
        print(f"Clone failed: {e.stderr}", file=sys.stderr)
        return []
    except OSError as e:
        # e.g. the git executable is not installed.
        print(f"Clone failed: {e}", file=sys.stderr)
        return []

    # Fetch every remote; a failure here is tolerated because the clone
    # may already have everything that is needed.
    try:
        _run_git('-C', clone_dir, 'fetch', '--all')
    except subprocess.CalledProcessError as e:
        print(f"Fetch --all failed: {e.stderr}", file=sys.stderr)

    # List commits reachable from any ref (--all).
    log_args = ['-C', clone_dir, 'log', '--all',
                '--pretty=format:%H%x1f%an%x1f%ai%x1f%s']
    if since:
        log_args.append(f'--since={since.isoformat()}')
    if until:
        log_args.append(f'--until={until.isoformat()}')
    try:
        log_output = _run_git(*log_args)
    except subprocess.CalledProcessError as e:
        print(f"git log --all failed: {e.stderr}", file=sys.stderr)
        return []
    if not log_output.strip():
        print("No commits found.", file=sys.stderr)
        return []

    # Lines are not stripped before splitting: Python treats "\x1f" as
    # whitespace, so stripping would eat the separator before an empty subject.
    entries = []
    for line in log_output.split('\n'):
        fields = line.split(_LOG_FIELD_SEP, 3)
        if len(fields) == 4:
            entries.append(fields)

    commits = []
    for idx, (sha, author, date, subject) in enumerate(entries):
        if idx >= max_commits:
            print(f"Reached max commits limit ({max_commits}), stopping.", file=sys.stderr)
            break

        # -m makes git emit a diff for merge commits as well.
        try:
            diff_output = _run_git('-C', clone_dir, 'show', '-m', '--patch',
                                   '--unified=3', sha)
            if not diff_output.strip():
                diff_output = "(no diff content possibly empty commit or merge commit without changes)"
        except subprocess.CalledProcessError as e:
            diff_output = f"(error getting diff: {e.stderr})"
        if len(diff_output) > max_diff_size:
            diff_output = diff_output[:max_diff_size] + "\n... (diff truncated)"

        try:
            numstat_output = _run_git('-C', clone_dir, 'show', '--numstat',
                                      '--pretty=format:', sha)
        except subprocess.CalledProcessError:
            insertions, deletions, files_changed = 0, 0, []
        else:
            insertions, deletions, files_changed = _parse_numstat(numstat_output)

        commit = {
            'sha': sha[:8],
            'author': author,
            'date': date,
            'message': subject,
            'diff': diff_output,
            'files_changed': files_changed,
            'insertions': insertions,
            'deletions': deletions,
            'branches': _branches_containing(clone_dir, sha),
        }
        # Progress/debug output.
        print(f"Commit {commit['sha']}: diff size = {len(commit['diff'])} bytes, branches: {len(commit['branches'])}", file=sys.stderr)
        commits.append(commit)
    return commits


def get_all_commits_with_diff(repo_url, since=None, until=None, max_diff_size=5000, max_commits=500):
    """Clone a repository and collect its commits from all branches.

    The repository is cloned into a temporary directory, commits are listed
    with ``git log --all``, and for each of the first *max_commits* commits
    the patch, the numstat statistics and the containing branches are read.
    The temporary clone is always removed before returning, including on
    failure.

    Args:
        repo_url: URL or path passed to ``git clone``.
        since: optional object with ``isoformat()`` (e.g. ``datetime``);
            passed to git as ``--since``.
        until: optional object with ``isoformat()``; passed as ``--until``.
        max_diff_size: maximum number of characters of diff kept per commit;
            longer diffs are truncated and marked.
        max_commits: maximum number of commits to process.

    Returns:
        A list of dicts with the keys ``sha`` (first 8 characters),
        ``author``, ``date``, ``message``, ``diff``, ``files_changed``,
        ``insertions``, ``deletions`` and ``branches``. The list is empty if
        cloning or listing commits fails; the reason is printed to stderr.
    """
    # Local import: shutil is not among the module-level imports of this file.
    import shutil

    clone_dir = tempfile.mkdtemp(prefix='git_report_')
    try:
        return _collect_commits(clone_dir, repo_url, since, until,
                                max_diff_size, max_commits)
    finally:
        # Best-effort cleanup on every exit path (success, failure, exception).
        shutil.rmtree(clone_dir, ignore_errors=True)
def _parse_cli_date(value):
    """argparse ``type=`` callback: parse an ISO date/datetime string.

    An empty string yields None, matching an omitted option. An unparsable
    value raises ArgumentTypeError so argparse prints a usage error instead
    of the script dying with a ValueError traceback.
    """
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}; expected ISO format such as YYYY-MM-DD"
        ) from None


def main(argv=None):
    """Command-line entry point: collect commit data and write it as JSON.

    Parses the options, calls get_all_commits_with_diff() for the given
    repository, writes the result to "<REPORTS_BASE_DIR>/<repo>_data.json"
    and prints that path to stdout. Progress messages go to stderr.

    Args:
        argv: optional list of command-line arguments; when None, argparse
            reads them from sys.argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--repo-url', required=True)
    parser.add_argument('--since', type=_parse_cli_date, help='YYYY-MM-DD')
    parser.add_argument('--until', type=_parse_cli_date, help='YYYY-MM-DD')
    parser.add_argument('--max-diff-size', type=int, default=5000,
                        help='Max characters of diff per commit (default 5000)')
    parser.add_argument('--max-commits', type=int, default=500,
                        help='Max commits to process (default 500)')
    args = parser.parse_args(argv)

    since = args.since
    until = args.until

    # Repository name = last path component of the URL, minus an optional ".git".
    repo_name_match = re.search(r'([^/]+?)(?:\.git)?$', args.repo_url.rstrip('/'))
    repo_name = repo_name_match.group(1) if repo_name_match else "unknown"

    print(f"Analyzing {repo_name} (all branches)...", file=sys.stderr)
    commits = get_all_commits_with_diff(args.repo_url, since, until,
                                        args.max_diff_size, args.max_commits)

    data = {
        "repo": repo_name,
        "period": {
            "since": since.isoformat() if since else "full_history_start",
            "until": until.isoformat() if until else "full_history_end",
        },
        "commits": commits,
        "issues": [],
    }

    os.makedirs(REPORTS_BASE_DIR, exist_ok=True)
    data_file = os.path.join(REPORTS_BASE_DIR, f"{repo_name}_data.json")
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # The output path is the only thing written to stdout.
    print(data_file)


if __name__ == '__main__':
    main()

3
scripts/requirements.txt Normal file
View file

@ -0,0 +1,3 @@
requests>=2.31.0
GitPython>=3.1.40
python-dateutil>=2.8.2