"""PortfolioLens — Phase 2 data-pull orchestrator (run ONCE). Pulls the PoC universe from the four free sources, writes immutable raw parquet to ``data/raw//.parquet``, a curated committed snapshot to ``data/snapshot/`` (per-instrument parquet + a wide value panel + a manifest), and prints a pull report. The Quarto chapter reads ONLY ``data/snapshot/`` — it never calls an API. Run this script whenever you want to refresh the vintage: python scripts/poc_pull.py # 2000-01-01 .. snapshot date python scripts/poc_pull.py --start 2010-01-01 API keys (Alpha Vantage, FRED, EIA) come from a gitignored ``.env`` — see ``.env.example``. yfinance and US Treasury need no key, and FRED falls back to a keyless reader, so most of the universe pulls even with no keys at all. """ from __future__ import annotations import argparse import os import sys from datetime import datetime, timezone from pathlib import Path import pandas as pd # make sibling modules importable regardless of CWD sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) import poc_fetch # noqa: E402 import poc_universe # noqa: E402 REPO_ROOT = Path(__file__).resolve().parents[1] RAW_DIR = REPO_ROOT / "data" / "raw" SNAPSHOT_DIR = REPO_ROOT / "data" / "snapshot" DEFAULT_START = "2000-01-01" SNAPSHOT_DATE = "2026-06-02" # pinned vintage; also the pull end date def _lib_version(modname: str) -> str: try: return __import__(modname).__version__ except Exception: return "n/a" def run(start: str, end: str, snapshot_dir: Path = SNAPSHOT_DIR) -> pd.DataFrame: keys = poc_fetch.load_keys() have = {k: bool(v) for k, v in keys.items()} print(f"PortfolioLens PoC pull | {start} .. {end}") print(f"API keys present: " + ", ".join(f"{k}={'yes' if v else 'NO'}" for k, v in have.items())) print("-" * 78) RAW_DIR.mkdir(parents=True, exist_ok=True) snapshot_dir.mkdir(parents=True, exist_ok=True) lib_by_source = { "yfinance": _lib_version("yfinance"), "fred": "fredgraph.csv", "alphavantage": "rest-v1", "eia": "rest-v2", "treasury": "rest-v1", } panel_cols: dict[str, pd.Series] = {} manifest_rows, pull_log = [], [] for inst in poc_universe.UNIVERSE: tag = f"[{inst.source:>12}] {inst.symbol:<12}" if inst.needs_key and inst.source != "fred": keyname = {"alphavantage": "ALPHAVANTAGE_API_KEY", "eia": "EIA_API_KEY"}.get(inst.source) if keyname and not keys.get(keyname): print(f"{tag} SKIP — no {keyname}") pull_log.append({"filekey": inst.filekey, "status": "skipped", "detail": f"missing {keyname}"}) continue try: df = poc_fetch.fetch_instrument(inst, start, end, keys) # clip to the pinned window df = df.loc[(df.index >= pd.Timestamp(start)) & (df.index <= pd.Timestamp(end))] if df.empty: raise ValueError("no rows in requested window") (RAW_DIR / inst.source).mkdir(parents=True, exist_ok=True) df.to_parquet(RAW_DIR / inst.source / f"{inst.filekey}.parquet") df.to_parquet(snapshot_dir / f"{inst.filekey}.parquet") panel_cols[inst.filekey] = df["value"] manifest_rows.append({ "filekey": inst.filekey, "symbol": inst.symbol, "name": inst.name, "channel": inst.channel, "source": inst.source, "asset_type": inst.asset_type, "frequency": inst.frequency, "start": df.index.min().date(), "end": df.index.max().date(), "n_rows": len(df), "n_fields": df.shape[1], "pull_ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), "lib_version": lib_by_source.get(inst.source, "n/a"), }) pull_log.append({"filekey": inst.filekey, "status": "ok", "detail": f"{len(df)} rows"}) print(f"{tag} OK {len(df):>6} rows {df.index.min().date()} .. {df.index.max().date()}") except Exception as exc: # noqa: BLE001 — log and continue, never abort the run pull_log.append({"filekey": inst.filekey, "status": "error", "detail": str(exc)[:160]}) print(f"{tag} ERROR — {str(exc)[:120]}") # wide value panel (outer join on date) if panel_cols: panel = pd.DataFrame(panel_cols).sort_index() panel.index.name = "date" panel.to_parquet(snapshot_dir / "panel_values.parquet") manifest = pd.DataFrame(manifest_rows) manifest.attrs["snapshot_date"] = SNAPSHOT_DATE manifest.to_csv(snapshot_dir / "manifest.csv", index=False) pd.DataFrame(pull_log).to_csv(snapshot_dir / "pull_log.csv", index=False) ok = sum(1 for r in pull_log if r["status"] == "ok") print("-" * 78) print(f"Snapshot written to {snapshot_dir} | {ok}/{len(poc_universe.UNIVERSE)} instruments OK" f" ({len(pull_log) - ok} skipped/error)") return manifest def main() -> None: ap = argparse.ArgumentParser(description="PortfolioLens Phase-2 PoC data pull") ap.add_argument("--start", default=DEFAULT_START) ap.add_argument("--end", default=SNAPSHOT_DATE) args = ap.parse_args() run(args.start, args.end) if __name__ == "__main__": main()