#!/usr/bin/env python3
"""
Collect commit data from all branches of a repository, including full diffs.

Saves JSON to /app/hermes_data/git_reports/{repo_name}_data.json
"""

import os
import re
import sys
import json
import argparse
import subprocess
import tempfile
from datetime import datetime
from collections import defaultdict

REPORTS_BASE_DIR = "/app/hermes_data/git_reports"

# ASCII "unit separator": used to delimit `git log` fields because, unlike
# '|', it practically never occurs in author names or commit subjects.
_FIELD_SEP = '\x1f'
_LOG_FORMAT = '--pretty=format:%H%x1f%an%x1f%ai%x1f%s'


def _run_git(clone_dir, *args):
    """Run ``git -C clone_dir <args>`` and return the CompletedProcess.

    Output is decoded as UTF-8 with undecodable bytes replaced, so a commit
    containing binary or legacy-encoded text cannot abort the whole run with
    a UnicodeDecodeError. Raises subprocess.CalledProcessError on a non-zero
    exit status.
    """
    return subprocess.run(
        ['git', '-C', clone_dir, *args],
        capture_output=True,
        check=True,
        encoding='utf-8',
        errors='replace',
    )


def _branches_containing(clone_dir, sha):
    """Return the sorted names of remote and local branches containing *sha*.

    Returns an empty list if either ``git branch`` call fails.
    """
    names = set()
    # First remote-tracking branches (-r), then local ones.
    for scope in (['-r'], []):
        try:
            proc = _run_git(clone_dir, 'branch', *scope, '--contains', sha)
        except subprocess.CalledProcessError:
            return []
        for raw in proc.stdout.split('\n'):
            # The checked-out local branch is prefixed with "* ".
            name = raw.strip().lstrip('* ')
            if name:
                names.add(name)
    # Sorted so that the generated JSON is reproducible between runs.
    return sorted(names)


def _diff_for_commit(clone_dir, sha, max_diff_size):
    """Return the patch text of *sha*, truncated to *max_diff_size* chars."""
    try:
        # -m makes git emit a diff for merge commits as well.
        proc = _run_git(clone_dir, 'show', '-m', '--patch', '--unified=3', sha)
    except subprocess.CalledProcessError as e:
        diff_output = f"(error getting diff: {e.stderr})"
    else:
        diff_output = proc.stdout
        if not diff_output.strip():
            diff_output = "(no diff content – possibly empty commit or merge commit without changes)"
    if len(diff_output) > max_diff_size:
        diff_output = diff_output[:max_diff_size] + "\n... (diff truncated)"
    return diff_output


def _numstat_for_commit(clone_dir, sha):
    """Return ``(insertions, deletions, files_changed)`` for *sha*.

    All three are zero/empty when ``git show --numstat`` fails.
    """
    try:
        proc = _run_git(clone_dir, 'show', '--numstat', '--pretty=format:', sha)
    except subprocess.CalledProcessError:
        return 0, 0, []

    insertions = 0
    deletions = 0
    files_changed = []
    for stat_line in proc.stdout.strip().split('\n'):
        if not stat_line.strip():
            continue
        fields = stat_line.split('\t', 2)
        if len(fields) < 3:
            continue
        added, removed, fname = fields
        # git prints '-' instead of a number for binary files.
        if added != '-':
            insertions += int(added)
        if removed != '-':
            deletions += int(removed)
        files_changed.append(fname)
    return insertions, deletions, files_changed


def _collect_commits(clone_dir, repo_url, since, until, max_diff_size, max_commits):
    """Clone *repo_url* into *clone_dir* and build the list of commit records."""
    try:
        # '--' stops a repo_url starting with '-' from being read as an option.
        subprocess.run(
            ['git', 'clone', '--', repo_url, clone_dir],
            check=True,
            capture_output=True,
            encoding='utf-8',
            errors='replace',
        )
    except FileNotFoundError:
        print("Clone failed: git executable not found", file=sys.stderr)
        return []
    except subprocess.CalledProcessError as e:
        print(f"Clone failed: {e.stderr}", file=sys.stderr)
        return []

    # Fetch all branches and tags.
    try:
        _run_git(clone_dir, 'fetch', '--all')
    except subprocess.CalledProcessError as e:
        # Keep going: the clone may already contain the branches.
        print(f"Fetch --all failed: {e.stderr}", file=sys.stderr)

    # List commits reachable from any ref (--all).
    log_args = ['log', '--all', _LOG_FORMAT]
    if since:
        log_args.append(f'--since={since.isoformat()}')
    if until:
        log_args.append(f'--until={until.isoformat()}')
    try:
        result = _run_git(clone_dir, *log_args)
    except subprocess.CalledProcessError as e:
        print(f"git log --all failed: {e.stderr}", file=sys.stderr)
        return []

    # NOTE: the output is deliberately not str.strip()-ed as a whole; Python
    # treats '\x1f' as whitespace, so that would eat the separator of a
    # trailing commit with an empty subject.
    log_lines = [ln for ln in result.stdout.split('\n') if ln.strip()]
    if not log_lines:
        print("No commits found.", file=sys.stderr)
        return []

    parsed = []
    for line in log_lines:
        parts = line.split(_FIELD_SEP, 3)
        if len(parts) < 4:
            continue
        parsed.append(parts)

    limit = max(max_commits, 0)
    commits = []
    for sha, author, date, subject in parsed[:limit]:
        insertions, deletions, files_changed = _numstat_for_commit(clone_dir, sha)
        commit = {
            'sha': sha[:8],
            'author': author,
            'date': date,
            'message': subject,
            'diff': _diff_for_commit(clone_dir, sha, max_diff_size),
            'files_changed': files_changed,
            'insertions': insertions,
            'deletions': deletions,
            'branches': _branches_containing(clone_dir, sha),
        }
        # Debug output.
        print(f"Commit {commit['sha']}: diff size = {len(commit['diff'])} bytes, branches: {len(commit['branches'])}", file=sys.stderr)
        commits.append(commit)

    if len(parsed) > limit:
        print(f"Reached max commits limit ({max_commits}), stopping.", file=sys.stderr)
    return commits


def get_all_commits_with_diff(repo_url, since=None, until=None, max_diff_size=5000, max_commits=500):
    """
    Clone the repository and collect commits from all branches via
    ``git log --all``.

    Args:
        repo_url: URL or local path passed to ``git clone``.
        since: optional datetime; only commits after it are listed.
        until: optional datetime; only commits before it are listed.
        max_diff_size: maximum number of diff characters kept per commit.
        max_commits: maximum number of commits to process.

    Returns:
        A list of commit dicts with the keys: sha, author, date, message,
        diff, files_changed, insertions, deletions, branches (list).
        An empty list is returned when cloning or listing commits fails,
        or when there are no commits; the reason is printed to stderr.
    """
    # The context manager removes the clone on every exit path, including
    # clone failures and unexpected exceptions.
    with tempfile.TemporaryDirectory(prefix='git_report_') as clone_dir:
        return _collect_commits(clone_dir, repo_url, since, until, max_diff_size, max_commits)


def _parse_iso_date(value):
    """argparse ``type=`` callback: parse an ISO date or report a usage error."""
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}, expected YYYY-MM-DD"
        ) from None


def main():
    """Parse CLI arguments, collect the commit data and write the JSON report.

    The path of the written file is printed to stdout; progress and errors
    go to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--repo-url', required=True)
    parser.add_argument('--since', type=_parse_iso_date, help='YYYY-MM-DD')
    parser.add_argument('--until', type=_parse_iso_date, help='YYYY-MM-DD')
    parser.add_argument('--max-diff-size', type=int, default=5000,
                        help='Max characters of diff per commit (default 5000)')
    parser.add_argument('--max-commits', type=int, default=500,
                        help='Max commits to process (default 500)')
    args = parser.parse_args()

    since = args.since
    until = args.until

    # Last path component of the URL, without a trailing ".git".
    repo_name_match = re.search(r'([^/]+?)(?:\.git)?$', args.repo_url.rstrip('/'))
    repo_name = repo_name_match.group(1) if repo_name_match else "unknown"

    print(f"Analyzing {repo_name} (all branches)...", file=sys.stderr)
    commits = get_all_commits_with_diff(args.repo_url, since, until,
                                        args.max_diff_size, args.max_commits)

    data = {
        "repo": repo_name,
        "period": {
            "since": since.isoformat() if since else "full_history_start",
            "until": until.isoformat() if until else "full_history_end"
        },
        "commits": commits,
        "issues": []
    }

    os.makedirs(REPORTS_BASE_DIR, exist_ok=True)
    data_file = os.path.join(REPORTS_BASE_DIR, f"{repo_name}_data.json")
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(data_file)


if __name__ == '__main__':
    main()