评审专家 agent
This commit is contained in:
80
scripts/extract_paper_text.py
Normal file
80
scripts/extract_paper_text.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Extract text from benchmark PDF and paper DOCX.
|
||||
|
||||
This script works around environments where the chat tooling cannot read
PDF or DOCX files directly. It writes plain-text copies (benchmark.txt and
paper.txt) into the output directory so that an agent can read and quote them.

Usage:
    python scripts/extract_paper_text.py --benchmark "标杆论文.pdf" --paper "我的论文/飞机稿_20260130.docx" --out "评审输出/飞机稿_20260130/raw"
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page in a PDF, with a banner before each page.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        One string in which each page is introduced by a
        ``===== PAGE n =====`` banner (``n`` counts from 1). A page whose
        ``extract_text()`` result is falsy contributes an empty body under
        its banner. The combined text is stripped of leading and trailing
        whitespace and then terminated with a single newline.
    """
    # Imported here rather than at module level, so pypdf is only loaded
    # when a PDF is actually processed.
    from pypdf import PdfReader

    pages = PdfReader(str(pdf_path)).pages
    chunks = [
        f"\n\n===== PAGE {number} =====\n\n{page.extract_text() or ''}"
        for number, page in enumerate(pages, start=1)
    ]
    # strip() removes the blank lines in front of the first banner.
    return "".join(chunks).strip() + "\n"
|
||||
|
||||
|
||||
def extract_docx_text(docx_path: Path) -> str:
    """Return the paragraph and table text of a DOCX file as plain text.

    All non-empty paragraphs come first, one per line. Table rows follow,
    one line per row with the cell texts joined by tab characters, so the
    tables do not appear at their original position in the document.

    Args:
        docx_path: Location of the DOCX file to read.

    Returns:
        The collected lines joined with newlines, stripped of leading and
        trailing whitespace and terminated with a single newline.
    """
    import docx  # python-docx

    document = docx.Document(str(docx_path))

    # Paragraphs: trim each one and drop those that are empty after trimming.
    trimmed = ((para.text or "").strip() for para in document.paragraphs)
    collected = [text for text in trimmed if text]

    # Tables (best-effort): rows whose cells are all empty are skipped.
    # NOTE(review): python-docx may yield the same cell more than once for
    # merged cells, which would repeat its text in the row - confirm.
    for table in document.tables:
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            if not any(values):
                continue
            collected.append("\t".join(values))

    return "\n".join(collected).strip() + "\n"
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: dump both documents to plain-text files.

    Reads ``--benchmark`` (PDF path), ``--paper`` (DOCX path) and ``--out``
    (output directory) from the command line. On success it writes
    ``benchmark.txt`` and ``paper.txt`` (UTF-8) into the output directory,
    creating that directory if needed, and prints ``OK`` followed by the two
    output paths.

    Raises:
        SystemExit: If the benchmark or the paper path does not exist (the
            message names the missing path), or if argparse rejects the
            command line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--benchmark", required=True, help="Path to benchmark PDF")
    ap.add_argument("--paper", required=True, help="Path to target paper DOCX")
    ap.add_argument("--out", required=True, help="Output directory")
    args = ap.parse_args()

    benchmark_path = Path(args.benchmark)
    paper_path = Path(args.paper)
    out_dir = Path(args.out)

    # Validate the inputs before touching the filesystem, so that a mistyped
    # path does not leave an empty output directory behind.
    if not benchmark_path.exists():
        raise SystemExit(f"Benchmark not found: {benchmark_path}")
    if not paper_path.exists():
        raise SystemExit(f"Paper not found: {paper_path}")

    out_dir.mkdir(parents=True, exist_ok=True)

    benchmark_txt = extract_pdf_text(benchmark_path)
    paper_txt = extract_docx_text(paper_path)

    benchmark_out = out_dir / "benchmark.txt"
    paper_out = out_dir / "paper.txt"
    benchmark_out.write_text(benchmark_txt, encoding="utf-8")
    paper_out.write_text(paper_txt, encoding="utf-8")

    print("OK")
    print(f"- {benchmark_out}")
    print(f"- {paper_out}")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user