#!/usr/bin/env python3
"""
Collect commit data from all branches of a repository, including full diffs.

Saves the result as JSON to /app/hermes_data/git_reports/<repo>_data.json
"""
# Per the diff header this file is added as scripts/generate_report.py.

import os
import re
import sys
import json
import argparse
import shutil
import subprocess
import tempfile
from datetime import datetime
from collections import defaultdict

REPORTS_BASE_DIR = "/app/hermes_data/git_reports"

# ASCII "unit separator": used between `git log` fields because, unlike '|',
# it does not realistically occur in author names or commit subjects.
_FIELD_SEP = '\x1f'
_LOG_FORMAT = '--pretty=format:%H%x1f%an%x1f%ai%x1f%s'


def _run_git(clone_dir, *args):
    """Run ``git -C clone_dir <args>`` and return the CompletedProcess.

    Output is decoded as UTF-8 with undecodable bytes replaced, so a commit
    touching binary or non-UTF-8 content cannot abort the run with a
    UnicodeDecodeError. Raises subprocess.CalledProcessError on non-zero exit.
    """
    return subprocess.run(
        ['git', '-C', clone_dir, *args],
        capture_output=True, text=True, encoding='utf-8', errors='replace',
        check=True,
    )


def _branches_containing(clone_dir, sha):
    """Return the sorted names of remote and local branches containing *sha*.

    Returns an empty list if either ``git branch --contains`` call fails.
    """
    try:
        remote_out = _run_git(clone_dir, 'branch', '-r', '--contains', sha).stdout
        local_out = _run_git(clone_dir, 'branch', '--contains', sha).stdout
    except subprocess.CalledProcessError:
        return []
    remote = [b.strip() for b in remote_out.split('\n') if b.strip()]
    # The current local branch is printed with a leading "* " marker.
    local = [b.strip().lstrip('* ') for b in local_out.split('\n') if b.strip()]
    # Sorted so the generated JSON is stable between runs.
    return sorted(set(remote + local))


def _parse_numstat(output):
    """Parse ``git show --numstat`` output.

    Returns a tuple ``(insertions, deletions, files_changed)``. Binary files
    are reported by git with '-' counts; they are listed in files_changed but
    contribute nothing to the totals.
    """
    insertions = 0
    deletions = 0
    files_changed = []
    for line in output.strip().split('\n'):
        if not line.strip():
            continue
        fields = line.split('\t')
        if len(fields) < 3:
            continue
        added, removed, fname = fields[0], fields[1], fields[2]
        if added != '-':
            insertions += int(added)
        if removed != '-':
            deletions += int(removed)
        files_changed.append(fname)
    return insertions, deletions, files_changed


def _collect_commits(clone_dir, repo_url, since, until, max_diff_size, max_commits):
    """Clone *repo_url* into *clone_dir* and build the list of commit dicts.

    The caller owns *clone_dir* and is responsible for removing it.
    """
    try:
        # "--" ends option parsing so a repo_url starting with '-' cannot be
        # interpreted by git as a command-line option.
        subprocess.run(
            ['git', 'clone', '--', repo_url, clone_dir],
            check=True, capture_output=True, text=True,
            encoding='utf-8', errors='replace',
        )
    except subprocess.CalledProcessError as e:
        print(f"Clone failed: {e.stderr}", file=sys.stderr)
        return []
    except OSError as e:
        # e.g. the git executable is not installed / not on PATH.
        print(f"Clone failed: {e}", file=sys.stderr)
        return []

    # Fetch all branches and tags.
    try:
        _run_git(clone_dir, 'fetch', '--all')
    except subprocess.CalledProcessError as e:
        print(f"Fetch --all failed: {e.stderr}", file=sys.stderr)
        # Keep going: the branches may already be present after the clone.

    # List commits reachable from any ref (--all).
    log_args = ['log', '--all', _LOG_FORMAT]
    if since:
        log_args.append(f'--since={since.isoformat()}')
    if until:
        log_args.append(f'--until={until.isoformat()}')

    try:
        log_output = _run_git(clone_dir, *log_args).stdout
    except subprocess.CalledProcessError as e:
        print(f"git log --all failed: {e.stderr}", file=sys.stderr)
        return []

    lines = log_output.strip().split('\n')
    if not lines or (len(lines) == 1 and not lines[0]):
        print("No commits found.", file=sys.stderr)
        return []

    # First pass: collect every commit, without branch/diff information yet.
    commits_dict = {}
    commit_order = []
    for line in lines:
        if not line.strip():
            continue
        parts = line.split(_FIELD_SEP, 3)
        if len(parts) < 4:
            continue
        sha, author, date, subject = parts
        commits_dict[sha] = {
            'sha': sha[:8],
            'author': author,
            'date': date,
            'message': subject,
            'diff': '',
            'files_changed': [],
            'insertions': 0,
            'deletions': 0,
            'branches': [],
        }
        commit_order.append(sha)

    # Second pass: branches, diff and statistics, limited to max_commits.
    limit = max(max_commits, 0)
    commits = []
    for sha in commit_order[:limit]:
        commit = commits_dict[sha]
        commit['branches'] = _branches_containing(clone_dir, sha)

        # Diff; -m makes git show a patch for merge commits as well.
        try:
            diff_output = _run_git(
                clone_dir, 'show', '-m', '--patch', '--unified=3', sha
            ).stdout
            if not diff_output.strip():
                diff_output = "(no diff content – possibly empty commit or merge commit without changes)"
        except subprocess.CalledProcessError as e:
            diff_output = f"(error getting diff: {e.stderr})"

        if len(diff_output) > max_diff_size:
            diff_output = diff_output[:max_diff_size] + "\n... (diff truncated)"
        commit['diff'] = diff_output

        # Per-file statistics.
        try:
            stat_output = _run_git(
                clone_dir, 'show', '--numstat', '--pretty=format:', sha
            ).stdout
        except subprocess.CalledProcessError:
            insertions, deletions, files_changed = 0, 0, []
        else:
            insertions, deletions, files_changed = _parse_numstat(stat_output)
        commit['insertions'] = insertions
        commit['deletions'] = deletions
        commit['files_changed'] = files_changed

        # Debug output.
        print(f"Commit {commit['sha']}: diff size = {len(commit['diff'])} bytes, branches: {len(commit['branches'])}", file=sys.stderr)
        commits.append(commit)

    if len(commit_order) > limit:
        print(f"Reached max commits limit ({max_commits}), stopping.", file=sys.stderr)

    return commits


def get_all_commits_with_diff(repo_url, since=None, until=None, max_diff_size=5000, max_commits=500):
    """
    Clone the repository and collect commits from all branches via
    ``git log --all``.

    Args:
        repo_url: URL or path passed to ``git clone``.
        since: optional datetime; only commits after it are listed.
        until: optional datetime; only commits before it are listed.
        max_diff_size: maximum number of characters of diff kept per commit.
        max_commits: maximum number of commits to process.

    Returns:
        A list of commit dicts with the keys: sha, author, date, message,
        diff, files_changed, insertions, deletions, branches (list).
        An empty list is returned if cloning or listing commits fails, or if
        no commits are found.
    """
    clone_dir = tempfile.mkdtemp(prefix='git_report_')
    try:
        return _collect_commits(clone_dir, repo_url, since, until,
                                max_diff_size, max_commits)
    finally:
        # Always remove the temporary clone, including on failure paths.
        shutil.rmtree(clone_dir, ignore_errors=True)


def _parse_iso_date(value):
    """argparse type: parse an ISO date string; an empty string means None."""
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}, expected YYYY-MM-DD")


def main():
    """Parse command-line arguments, collect commits and write the JSON report.

    The path of the written file is printed to stdout; progress and errors go
    to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--repo-url', required=True)
    parser.add_argument('--since', type=_parse_iso_date, help='YYYY-MM-DD')
    parser.add_argument('--until', type=_parse_iso_date, help='YYYY-MM-DD')
    parser.add_argument('--max-diff-size', type=int, default=5000,
                        help='Max characters of diff per commit (default 5000)')
    parser.add_argument('--max-commits', type=int, default=500,
                        help='Max commits to process (default 500)')
    args = parser.parse_args()

    since = args.since
    until = args.until

    # Last path component of the URL, without a trailing ".git".
    repo_name_match = re.search(r'([^/]+?)(?:\.git)?$', args.repo_url.rstrip('/'))
    repo_name = repo_name_match.group(1) if repo_name_match else "unknown"

    print(f"Analyzing {repo_name} (all branches)...", file=sys.stderr)
    commits = get_all_commits_with_diff(args.repo_url, since, until,
                                        args.max_diff_size, args.max_commits)

    data = {
        "repo": repo_name,
        "period": {
            "since": since.isoformat() if since else "full_history_start",
            "until": until.isoformat() if until else "full_history_end"
        },
        "commits": commits,
        "issues": []
    }

    os.makedirs(REPORTS_BASE_DIR, exist_ok=True)
    data_file = os.path.join(REPORTS_BASE_DIR, f"{repo_name}_data.json")
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(data_file)


if __name__ == '__main__':
    main()

# The same change also adds scripts/requirements.txt with the following lines:
#   requests>=2.31.0
#   GitPython>=3.1.40
#   python-dateutil>=2.8.2