Elfsong committed on
Commit
dd2e7f2
·
1 Parent(s): 5b1fa09

chore: initialize virtual environment dependencies and project structure

Browse files
Files changed (2) hide show
  1. CIKM-26-Paper-Reader +1 -0
  2. src/daily_retrieve.py +42 -35
CIKM-26-Paper-Reader ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 0436ef63584ada08d02e8129739e263aa4a0ead2
src/daily_retrieve.py CHANGED
@@ -6,6 +6,7 @@ Usage:
6
  uv run python src/daily_retrieve.py --date 2026-03-25 # single day
7
  uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
8
  uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
 
9
  """
10
 
11
  import argparse
@@ -664,7 +665,8 @@ def _log(date_str: str, msg: str):
664
  print(f"[{date_str}] {msg}", flush=True)
665
 
666
 
667
- def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None) -> str:
 
668
  """Fetch, summarize, and push papers for a single date. Returns status string."""
669
  log = display.update if display else lambda d, done, total, s: _log(d, s)
670
 
@@ -672,17 +674,18 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
672
  output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
673
 
674
  # --- Check if fully done on HF (papers + trending) ---
675
- split = _date_to_split(date_str)
676
- paper_files = _list_hf_files(HF_DATASET_REPO)
677
- trending_files = _list_hf_files(HF_TRENDING_REPO)
678
- has_papers_on_hf = any(split in f for f in paper_files)
679
- has_trending_on_hf = any(split in f for f in trending_files)
680
- if has_papers_on_hf and has_trending_on_hf:
681
- # Verify no retryable errors in HF data
682
- hf_papers = _pull_papers_from_hf(date_str)
683
- if hf_papers and all(_paper_is_processed(p) for p in hf_papers):
684
- log(date_str, 0, 0, "✓ synced")
685
- return f"{date_str}: synced"
 
686
 
687
  # --- 1) Always fetch fresh paper list ---
688
  log(date_str, 0, 0, "fetching...")
@@ -694,25 +697,28 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
694
  total = len(papers)
695
 
696
  # --- 2) Merge cached summaries (local JSON then HF dataset) ---
697
- cached: dict[str, dict] = {}
698
- if output_path.exists():
699
- try:
700
- with open(output_path, encoding="utf-8") as f:
701
- for p in json.load(f):
702
- if _paper_is_processed(p):
703
- cached[p["paper_id"]] = p
704
- except Exception:
705
- pass
706
- if len(cached) < total:
707
- log(date_str, len(cached), total, "checking HF cache...")
708
- for p in _pull_papers_from_hf(date_str):
709
- pid = p.get("paper_id", "")
710
- if pid and pid not in cached and _paper_is_processed(p):
711
- cached[pid] = p
712
- for paper in papers:
713
- pid = paper.get("paper_id", "")
714
- if pid in cached:
715
- paper.update(cached[pid])
 
 
 
716
 
717
  # --- 3) Summarize papers not yet processed ---
718
  done = sum(1 for p in papers if _paper_is_processed(p))
@@ -757,7 +763,7 @@ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay
757
  trending = None
758
  trending_split = _date_to_split(date_str)
759
  trending_files = _list_hf_files(HF_TRENDING_REPO)
760
- if any(trending_split in f for f in trending_files):
761
  log(date_str, total, total, "trending cached on HF")
762
  else:
763
  log(date_str, total, total, "generating trending...")
@@ -822,6 +828,7 @@ def main():
822
  parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
823
  parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
824
  parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
 
825
  args = parser.parse_args()
826
 
827
  start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
@@ -841,17 +848,17 @@ def main():
841
  workers = min(args.workers, total)
842
 
843
  if total == 1:
844
- process_date(dates[0], no_push=args.no_push)
845
  return
846
 
847
  display = ProgressDisplay(dates, workers)
848
 
849
  if workers <= 1:
850
  for date_str in dates:
851
- process_date(date_str, no_push=args.no_push, display=display)
852
  else:
853
  with ThreadPoolExecutor(max_workers=workers) as pool:
854
- futures = {pool.submit(process_date, d, args.no_push, display): d for d in dates}
855
  for future in as_completed(futures):
856
  try:
857
  future.result()
 
6
  uv run python src/daily_retrieve.py --date 2026-03-25 # single day
7
  uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 # range
8
  uv run python src/daily_retrieve.py --date 2026-03-01 --end 2026-03-31 --workers 4
9
+ uv run python src/daily_retrieve.py --date 2026-03-25 --recollect # force re-summarize
10
  """
11
 
12
  import argparse
 
665
  print(f"[{date_str}] {msg}", flush=True)
666
 
667
 
668
+ def process_date(date_str: str, no_push: bool = False, display: ProgressDisplay | None = None,
669
+ recollect: bool = False) -> str:
670
  """Fetch, summarize, and push papers for a single date. Returns status string."""
671
  log = display.update if display else lambda d, done, total, s: _log(d, s)
672
 
 
674
  output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
675
 
676
  # --- Check if fully done on HF (papers + trending) ---
677
+ if not recollect:
678
+ split = _date_to_split(date_str)
679
+ paper_files = _list_hf_files(HF_DATASET_REPO)
680
+ trending_files = _list_hf_files(HF_TRENDING_REPO)
681
+ has_papers_on_hf = any(split in f for f in paper_files)
682
+ has_trending_on_hf = any(split in f for f in trending_files)
683
+ if has_papers_on_hf and has_trending_on_hf:
684
+ # Verify no retryable errors in HF data
685
+ hf_papers = _pull_papers_from_hf(date_str)
686
+ if hf_papers and all(_paper_is_processed(p) for p in hf_papers):
687
+ log(date_str, 0, 0, "✓ synced")
688
+ return f"{date_str}: synced"
689
 
690
  # --- 1) Always fetch fresh paper list ---
691
  log(date_str, 0, 0, "fetching...")
 
697
  total = len(papers)
698
 
699
  # --- 2) Merge cached summaries (local JSON then HF dataset) ---
700
+ if not recollect:
701
+ cached: dict[str, dict] = {}
702
+ if output_path.exists():
703
+ try:
704
+ with open(output_path, encoding="utf-8") as f:
705
+ for p in json.load(f):
706
+ if _paper_is_processed(p):
707
+ cached[p["paper_id"]] = p
708
+ except Exception:
709
+ pass
710
+ if len(cached) < total:
711
+ log(date_str, len(cached), total, "checking HF cache...")
712
+ for p in _pull_papers_from_hf(date_str):
713
+ pid = p.get("paper_id", "")
714
+ if pid and pid not in cached and _paper_is_processed(p):
715
+ cached[pid] = p
716
+ for paper in papers:
717
+ pid = paper.get("paper_id", "")
718
+ if pid in cached:
719
+ paper.update(cached[pid])
720
+ else:
721
+ log(date_str, 0, total, "recollecting (ignoring cache)...")
722
 
723
  # --- 3) Summarize papers not yet processed ---
724
  done = sum(1 for p in papers if _paper_is_processed(p))
 
763
  trending = None
764
  trending_split = _date_to_split(date_str)
765
  trending_files = _list_hf_files(HF_TRENDING_REPO)
766
+ if not recollect and any(trending_split in f for f in trending_files):
767
  log(date_str, total, total, "trending cached on HF")
768
  else:
769
  log(date_str, total, total, "generating trending...")
 
828
  parser.add_argument("--end", type=_parse_date, default=None, help="End date YYYY-MM-DD (inclusive, for range retrieval)")
829
  parser.add_argument("--workers", type=int, default=1, help="Number of parallel workers (default: 1)")
830
  parser.add_argument("--no-push", action="store_true", help="Skip pushing to HuggingFace")
831
+ parser.add_argument("--recollect", action="store_true", help="Force re-summarize all papers, ignoring cache")
832
  args = parser.parse_args()
833
 
834
  start = args.date if isinstance(args.date, datetime) else datetime.strptime(args.date, "%Y-%m-%d")
 
848
  workers = min(args.workers, total)
849
 
850
  if total == 1:
851
+ process_date(dates[0], no_push=args.no_push, recollect=args.recollect)
852
  return
853
 
854
  display = ProgressDisplay(dates, workers)
855
 
856
  if workers <= 1:
857
  for date_str in dates:
858
+ process_date(date_str, no_push=args.no_push, display=display, recollect=args.recollect)
859
  else:
860
  with ThreadPoolExecutor(max_workers=workers) as pool:
861
+ futures = {pool.submit(process_date, d, args.no_push, display, args.recollect): d for d in dates}
862
  for future in as_completed(futures):
863
  try:
864
  future.result()