Spaces:
Running
Running
feat: add --recollect flag to daily_retrieve.py (force re-summarize, ignoring local/HF caches) and add CIKM-26-Paper-Reader submodule
Browse files
- CIKM-26-Paper-Reader +1 -0
- src/daily_retrieve.py +42 -35
CIKM-26-Paper-Reader
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 0436ef63584ada08d02e8129739e263aa4a0ead2
|
src/daily_retrieve.py
CHANGED
|
@@ -6,6 +6,7 @@ Usage:
|
|
| 6 |
uv run python src/daily_retrieve.py --date 2026-03-25 # single day
|
| 7 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
|
| 8 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import argparse
|
|
@@ -664,7 +665,8 @@ def _log(date_str: str, msg: str):
|
|
| 664 |
print(f"[{date_str}] {msg}", flush=True)
|
| 665 |
|
| 666 |
|
| 667 |
-
def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None
|
|
|
|
| 668 |
"""Fetch, summarize, and push papers for a single date. Returns status string."""
|
| 669 |
log = display.update if display else lambda d, done, total, s: _log(d, s)
|
| 670 |
|
|
@@ -672,17 +674,18 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 672 |
output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
|
| 673 |
|
| 674 |
# --- Check if fully done on HF (papers + trending) ---
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
|
|
|
| 686 |
|
| 687 |
# --- 1) Always fetch fresh paper list ---
|
| 688 |
log(date_str, 0, 0, "fetching...")
|
|
@@ -694,25 +697,28 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 694 |
total = len(papers)
|
| 695 |
|
| 696 |
# --- 2) Merge cached summaries (local JSON then HF dataset) ---
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
|
|
|
| 716 |
|
| 717 |
# --- 3) Summarize papers not yet processed ---
|
| 718 |
done = sum(1 for p in papers if _paper_is_processed(p))
|
|
@@ -757,7 +763,7 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
|
|
| 757 |
trending = None
|
| 758 |
trending_split = _date_to_split(date_str)
|
| 759 |
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 760 |
-
if any(trending_split in f for f in trending_files):
|
| 761 |
log(date_str, total, total, "trending cached on HF")
|
| 762 |
else:
|
| 763 |
log(date_str, total, total, "generating trending...")
|
|
@@ -822,6 +828,7 @@ def main():
|
|
| 822 |
parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
|
| 823 |
parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
|
| 824 |
parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
|
|
|
|
| 825 |
args = parser.parse_args()
|
| 826 |
|
| 827 |
start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
|
|
@@ -841,17 +848,17 @@ def main():
|
|
| 841 |
workers = min(args.workers, total)
|
| 842 |
|
| 843 |
if total == 1:
|
| 844 |
-
process_date(dates[0], no_push=args.no_push)
|
| 845 |
return
|
| 846 |
|
| 847 |
display = ProgressDisplay(dates, workers)
|
| 848 |
|
| 849 |
if workers <= 1:
|
| 850 |
for date_str in dates:
|
| 851 |
-
process_date(date_str, no_push=args.no_push, display=display)
|
| 852 |
else:
|
| 853 |
with ThreadPoolExecutor(max_workers=workers) as pool:
|
| 854 |
-
futures = {pool.submit(process_date, d, args.no_push, display): d for d in dates}
|
| 855 |
for future in as_completed(futures):
|
| 856 |
try:
|
| 857 |
future.result()
|
|
|
|
| 6 |
uv run python src/daily_retrieve.py --date 2026-03-25 # single day
|
| 7 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
|
| 8 |
uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
|
| 9 |
+
uv run python src/daily_retrieve.py --date 2026-03-25 --recollect # force re-summarize
|
| 10 |
"""
|
| 11 |
|
| 12 |
import argparse
|
|
|
|
| 665 |
print(f"[{date_str}] {msg}", flush=True)
|
| 666 |
|
| 667 |
|
| 668 |
+
def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None,
|
| 669 |
+
recollect: bool = False) -> str:
|
| 670 |
"""Fetch, summarize, and push papers for a single date. Returns status string."""
|
| 671 |
log = display.update if display else lambda d, done, total, s: _log(d, s)
|
| 672 |
|
|
|
|
| 674 |
output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
|
| 675 |
|
| 676 |
# --- Check if fully done on HF (papers + trending) ---
|
| 677 |
+
if not recollect:
|
| 678 |
+
split = _date_to_split(date_str)
|
| 679 |
+
paper_files = _list_hf_files(HF_DATASET_REPO)
|
| 680 |
+
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 681 |
+
has_papers_on_hf = any(split in f for f in paper_files)
|
| 682 |
+
has_trending_on_hf = any(split in f for f in trending_files)
|
| 683 |
+
if has_papers_on_hf and has_trending_on_hf:
|
| 684 |
+
# Verify no retryable errors in HF data
|
| 685 |
+
hf_papers = _pull_papers_from_hf(date_str)
|
| 686 |
+
if hf_papers and all(_paper_is_processed(p) for p in hf_papers):
|
| 687 |
+
log(date_str, 0, 0, "✓ synced")
|
| 688 |
+
return f"{date_str}: synced"
|
| 689 |
|
| 690 |
# --- 1) Always fetch fresh paper list ---
|
| 691 |
log(date_str, 0, 0, "fetching...")
|
|
|
|
| 697 |
total = len(papers)
|
| 698 |
|
| 699 |
# --- 2) Merge cached summaries (local JSON then HF dataset) ---
|
| 700 |
+
if not recollect:
|
| 701 |
+
cached: dict[str, dict] = {}
|
| 702 |
+
if output_path.exists():
|
| 703 |
+
try:
|
| 704 |
+
with open(output_path, encoding="utf-8") as f:
|
| 705 |
+
for p in json.load(f):
|
| 706 |
+
if _paper_is_processed(p):
|
| 707 |
+
cached[p["paper_id"]] = p
|
| 708 |
+
except Exception:
|
| 709 |
+
pass
|
| 710 |
+
if len(cached) < total:
|
| 711 |
+
log(date_str, len(cached), total, "checking HF cache...")
|
| 712 |
+
for p in _pull_papers_from_hf(date_str):
|
| 713 |
+
pid = p.get("paper_id", "")
|
| 714 |
+
if pid and pid not in cached and _paper_is_processed(p):
|
| 715 |
+
cached[pid] = p
|
| 716 |
+
for paper in papers:
|
| 717 |
+
pid = paper.get("paper_id", "")
|
| 718 |
+
if pid in cached:
|
| 719 |
+
paper.update(cached[pid])
|
| 720 |
+
else:
|
| 721 |
+
log(date_str, 0, total, "recollecting (ignoring cache)...")
|
| 722 |
|
| 723 |
# --- 3) Summarize papers not yet processed ---
|
| 724 |
done = sum(1 for p in papers if _paper_is_processed(p))
|
|
|
|
| 763 |
trending = None
|
| 764 |
trending_split = _date_to_split(date_str)
|
| 765 |
trending_files = _list_hf_files(HF_TRENDING_REPO)
|
| 766 |
+
if not recollect and any(trending_split in f for f in trending_files):
|
| 767 |
log(date_str, total, total, "trending cached on HF")
|
| 768 |
else:
|
| 769 |
log(date_str, total, total, "generating trending...")
|
|
|
|
| 828 |
parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
|
| 829 |
parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
|
| 830 |
parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
|
| 831 |
+
parser.add_argument("--recollect", action="store_true", help="Force re-summarize all papers, ignoring cache")
|
| 832 |
args = parser.parse_args()
|
| 833 |
|
| 834 |
start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
|
|
|
|
| 848 |
workers = min(args.workers, total)
|
| 849 |
|
| 850 |
if total == 1:
|
| 851 |
+
process_date(dates[0], no_push=args.no_push, recollect=args.recollect)
|
| 852 |
return
|
| 853 |
|
| 854 |
display = ProgressDisplay(dates, workers)
|
| 855 |
|
| 856 |
if workers <= 1:
|
| 857 |
for date_str in dates:
|
| 858 |
+
process_date(date_str, no_push=args.no_push, display=display, recollect=args.recollect)
|
| 859 |
else:
|
| 860 |
with ThreadPoolExecutor(max_workers=workers) as pool:
|
| 861 |
+
futures = {pool.submit(process_date, d, args.no_push, display, args.recollect): d for d in dates}
|
| 862 |
for future in as_completed(futures):
|
| 863 |
try:
|
| 864 |
future.result()
|