评审专家 agent

2026-02-10 10:54:09 +00:00
parent 8df9aa369c
commit 3564b5eb57
7 changed files with 2875 additions and 0 deletions
--- a/scripts/extract_paper_text.py
+++ b/scripts/extract_paper_text.py
@@ -0,0 +1,80 @@
+"""Extract text from benchmark PDF and paper DOCX.
+
+This script exists to work around environments where the chat tooling
+cannot directly read PDF/DOCX. It generates plain text files that are
+easy for an agent to read and quote.
+
+Usage:
+  python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+from pathlib import Path
+
+
+def extract_pdf_text(pdf_path: Path) -> str:
+    from pypdf import PdfReader
+
+    reader = PdfReader(str(pdf_path))
+    parts = []
+    for i, page in enumerate(reader.pages, start=1):
+        text = page.extract_text() or ""
+        parts.append(f"\n\n===== PAGE {i} =====\n\n{text}")
+    return "".join(parts).strip() + "\n"
+
+
+def extract_docx_text(docx_path: Path) -> str:
+    import docx  # python-docx
+
+    d = docx.Document(str(docx_path))
+    lines = []
+
+    # paragraphs
+    for p in d.paragraphs:
+        t = (p.text or "").strip()
+        if t:
+            lines.append(t)
+
+    # tables (best-effort)
+    for table in d.tables:
+        for row in table.rows:
+            cells = [c.text.strip() for c in row.cells]
+            if any(cells):
+                lines.append("\t".join(cells))
+
+    return "\n".join(lines).strip() + "\n"
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
+    ap.add_argument("--paper", required=True, help="Path to target paper DOCX")
+    ap.add_argument("--out", required=True, help="Output directory")
+    args = ap.parse_args()
+
+    benchmark_path = Path(args.benchmark)
+    paper_path = Path(args.paper)
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    if not benchmark_path.exists():
+        raise SystemExit(f"Benchmark not found: {benchmark_path}")
+    if not paper_path.exists():
+        raise SystemExit(f"Paper not found: {paper_path}")
+
+    benchmark_txt = extract_pdf_text(benchmark_path)
+    paper_txt = extract_docx_text(paper_path)
+
+    (out_dir / "benchmark.txt").write_text(benchmark_txt, encoding="utf-8")
+    (out_dir / "paper.txt").write_text(paper_txt, encoding="utf-8")
+
+    print("OK")
+    print(f"- {out_dir / 'benchmark.txt'}")
+    print(f"- {out_dir / 'paper.txt'}")
+
+
+if __name__ == "__main__":
+    main()