Spaces:
Running
Running
feat: add --recollect flag to daily_retrieve.py (force re-summarize, ignoring local/HF caches) and add CIKM-26-Paper-Reader submodule
Browse files
- CIKM-26-Paper-Reader +1 -0
- src/daily_retrieve.py +42 -35
CIKM-26-Paper-Reader
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 0436ef63584ada08d02e8129739e263aa4a0ead2
|
src/daily_retrieve.py
CHANGED
|
@@ -6,6 +6,7 @@ Usage:
|
|
| 6 |
uv run python src/daily_retrieve.py --date 2026-03-25 # single day
|
| 7 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
|
| 8 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import argparse
|
|
@@ -664,7 +665,8 @@ def _log(date_str: str, msg: str):
|
|
| 664 |
print(f"[{date_str}] {msg}", flush=True)
|
| 665 |
|
| 666 |
|
| 667 |
-
def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None
|
|
|
|
| 668 |
"""Fetch, summarize, and push papers for a single date. Returns status string."""
|
| 669 |
log = display.update if display else lambda d, done, total, s: _log(d, s)
|
| 670 |
|
|
@@ -672,17 +674,18 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 672 |
output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
|
| 673 |
|
| 674 |
# --- Check if fully done on HF (papers + trending) ---
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
|
|
|
| 686 |
|
| 687 |
# --- 1) Always fetch fresh paper list ---
|
| 688 |
log(date_str, 0, 0, "fetching...")
|
|
@@ -694,25 +697,28 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 694 |
total = len(papers)
|
| 695 |
|
| 696 |
# --- 2) Merge cached summaries (local JSON then HF dataset) ---
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
|
|
|
| 716 |
|
| 717 |
# --- 3) Summarize papers not yet processed ---
|
| 718 |
done = sum(1 for p in papers if _paper_is_processed(p))
|
|
@@ -757,7 +763,7 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 757 |
trending = None
|
| 758 |
trending_split = _date_to_split(date_str)
|
| 759 |
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 760 |
-
if any(trending_split in f for f in trending_files):
|
| 761 |
log(date_str, total, total, "trending cached on HF")
|
| 762 |
else:
|
| 763 |
log(date_str, total, total, "generating trending...")
|
|
@@ -822,6 +828,7 @@ def main():
|
|
| 822 |
parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
|
| 823 |
parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
|
| 824 |
parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
|
|
|
|
| 825 |
args = parser.parse_args()
|
| 826 |
|
| 827 |
start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
|
|
@@ -841,17 +848,17 @@ def main():
|
|
| 841 |
workers = min(args.workers, total)
|
| 842 |
|
| 843 |
if total == 1:
|
| 844 |
-
process_date(dates[0], no_push=args.no_push)
|
| 845 |
return
|
| 846 |
|
| 847 |
display = ProgressDisplay(dates, workers)
|
| 848 |
|
| 849 |
if workers <= 1:
|
| 850 |
for date_str in dates:
|
| 851 |
-
process_date(date_str, no_push=args.no_push, display=display)
|
| 852 |
else:
|
| 853 |
with ThreadPoolExecutor(max_workers=workers) as pool:
|
| 854 |
-
futures = {pool.submit(process_date, d, args.no_push, display): d for d in dates}
|
| 855 |
for future in as_completed(futures):
|
| 856 |
try:
|
| 857 |
future.result()
|
|
|
|
| 6 |
uv run python src/daily_retrieve.py --date 2026-03-25 # single day
|
| 7 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
|
| 8 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
|
| 9 |
+
uv run python src/daily_retrieve.py --date 2026-03-25 --recollect # force re-summarize
|
| 10 |
"""
|
| 11 |
|
| 12 |
import argparse
|
|
|
|
| 665 |
print(f"[{date_str}] {msg}", flush=True)
|
| 666 |
|
| 667 |
|
| 668 |
+
def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None,
|
| 669 |
+
recollect: bool = False) -> str:
|
| 670 |
"""Fetch, summarize, and push papers for a single date. Returns status string."""
|
| 671 |
log = display.update if display else lambda d, done, total, s: _log(d, s)
|
| 672 |
|
|
|
|
| 674 |
output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
|
| 675 |
|
| 676 |
# --- Check if fully done on HF (papers + trending) ---
|
| 677 |
+
if not recollect:
|
| 678 |
+
split = _date_to_split(date_str)
|
| 679 |
+
paper_files = _list_hf_files(HF_DATASET_REPO)
|
| 680 |
+
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 681 |
+
has_papers_on_hf = any(split in f for f in paper_files)
|
| 682 |
+
has_trending_on_hf = any(split in f for f in trending_files)
|
| 683 |
+
if has_papers_on_hf and has_trending_on_hf:
|
| 684 |
+
# Verify no retryable errors in HF data
|
| 685 |
+
hf_papers = _pull_papers_from_hf(date_str)
|
| 686 |
+
if hf_papers and all(_paper_is_processed(p) for p in hf_papers):
|
| 687 |
+
log(date_str, 0, 0, "✓ synced")
|
| 688 |
+
return f"{date_str}: synced"
|
| 689 |
|
| 690 |
# --- 1) Always fetch fresh paper list ---
|
| 691 |
log(date_str, 0, 0, "fetching...")
|
|
|
|
| 697 |
total = len(papers)
|
| 698 |
|
| 699 |
# --- 2) Merge cached summaries (local JSON then HF dataset) ---
|
| 700 |
+
if not recollect:
|
| 701 |
+
cached: dict[str, dict] = {}
|
| 702 |
+
if output_path.exists():
|
| 703 |
+
try:
|
| 704 |
+
with open(output_path, encoding="utf-8") as f:
|
| 705 |
+
for p in json.load(f):
|
| 706 |
+
if _paper_is_processed(p):
|
| 707 |
+
cached[p["paper_id"]] = p
|
| 708 |
+
except Exception:
|
| 709 |
+
pass
|
| 710 |
+
if len(cached) < total:
|
| 711 |
+
log(date_str, len(cached), total, "checking HF cache...")
|
| 712 |
+
for p in _pull_papers_from_hf(date_str):
|
| 713 |
+
pid = p.get("paper_id", "")
|
| 714 |
+
if pid and pid not in cached and _paper_is_processed(p):
|
| 715 |
+
cached[pid] = p
|
| 716 |
+
for paper in papers:
|
| 717 |
+
pid = paper.get("paper_id", "")
|
| 718 |
+
if pid in cached:
|
| 719 |
+
paper.update(cached[pid])
|
| 720 |
+
else:
|
| 721 |
+
log(date_str, 0, total, "recollecting (ignoring cache)...")
|
| 722 |
|
| 723 |
# --- 3) Summarize papers not yet processed ---
|
| 724 |
done = sum(1 for p in papers if _paper_is_processed(p))
|
|
|
|
| 763 |
trending = None
|
| 764 |
trending_split = _date_to_split(date_str)
|
| 765 |
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 766 |
+
if not recollect and any(trending_split in f for f in trending_files):
|
| 767 |
log(date_str, total, total, "trending cached on HF")
|
| 768 |
else:
|
| 769 |
log(date_str, total, total, "generating trending...")
|
|
|
|
| 828 |
parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
|
| 829 |
parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
|
| 830 |
parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
|
| 831 |
+
parser.add_argument("--recollect", action="store_true", help="Force re-summarize all papers, ignoring cache")
|
| 832 |
args = parser.parse_args()
|
| 833 |
|
| 834 |
start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
|
|
|
|
| 848 |
workers = min(args.workers, total)
|
| 849 |
|
| 850 |
if total == 1:
|
| 851 |
+
process_date(dates[0], no_push=args.no_push, recollect=args.recollect)
|
| 852 |
return
|
| 853 |
|
| 854 |
display = ProgressDisplay(dates, workers)
|
| 855 |
|
| 856 |
if workers <= 1:
|
| 857 |
for date_str in dates:
|
| 858 |
+
process_date(date_str, no_push=args.no_push, display=display, recollect=args.recollect)
|
| 859 |
else:
|
| 860 |
with ThreadPoolExecutor(max_workers=workers) as pool:
|
| 861 |
+
futures = {pool.submit(process_date, d, args.no_push, display, args.recollect): d for d in dates}
|
| 862 |
for future in as_completed(futures):
|
| 863 |
try:
|
| 864 |
future.result()
|