"""PortfolioLens — Phase 2 data-pull orchestrator (run ONCE).

Pulls the PoC universe from the four free sources, writes immutable raw parquet
to ``data/raw/<source>/<filekey>.parquet``, a curated committed snapshot to
``data/snapshot/`` (per-instrument parquet + a wide value panel + a manifest),
and prints a pull report.

The Quarto chapter reads ONLY ``data/snapshot/`` — it never calls an API. Run
this script whenever you want to refresh the vintage:

    python scripts/poc_pull.py                 # 2000-01-01 .. snapshot date
    python scripts/poc_pull.py --start 2010-01-01

API keys (Alpha Vantage, FRED, EIA) come from a gitignored ``.env`` — see
``.env.example``. yfinance and US Treasury need no key, and FRED falls back to a
keyless reader, so most of the universe pulls even with no keys at all.
"""

from __future__ import annotations

import argparse
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

# make sibling modules importable regardless of CWD
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import poc_fetch  # noqa: E402
import poc_universe  # noqa: E402

# Parent of the directory that contains this script file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Raw pulls are written below this directory, one sub-directory per source.
RAW_DIR = REPO_ROOT.joinpath("data", "raw")
# Default output directory for the snapshot files written by ``run``.
SNAPSHOT_DIR = REPO_ROOT.joinpath("data", "snapshot")

# Default value of the ``--start`` command-line option.
DEFAULT_START = "2000-01-01"
# Pinned vintage; also the default value of the ``--end`` command-line option.
SNAPSHOT_DATE = "2026-06-02"


def _lib_version(modname: str) -> str:
    """Return the ``__version__`` attribute of module *modname*, or ``"n/a"``.

    Any failure — the module cannot be imported, or it exposes no
    ``__version__`` attribute — yields the placeholder ``"n/a"`` instead of
    raising.
    """
    try:
        module = __import__(modname)
        version = module.__version__
    except Exception:  # best-effort lookup: a missing library must not raise here
        version = "n/a"
    return version


def run(start: str, end: str, snapshot_dir: Path = SNAPSHOT_DIR) -> pd.DataFrame:
    """Pull every instrument in ``poc_universe.UNIVERSE`` and write the snapshot.

    For each instrument, the frame returned by ``poc_fetch.fetch_instrument``
    is clipped to ``[start, end]`` (both ends inclusive) and written to
    ``RAW_DIR/<source>/<filekey>.parquet`` and to
    ``snapshot_dir/<filekey>.parquet``. Its ``value`` column becomes one column
    of the wide panel ``snapshot_dir/panel_values.parquet``. Finally
    ``manifest.csv`` (one row per successful pull) and ``pull_log.csv`` (one
    row per instrument, whether ok, skipped or failed) are written to
    ``snapshot_dir``.

    A failure while pulling one instrument is recorded in the pull log and the
    loop moves on to the next instrument. Instruments whose source is
    ``alphavantage`` or ``eia`` are skipped when the matching API key is absent.

    Args:
        start: First date of the window; parsed with ``pd.Timestamp``.
        end: Last date of the window (inclusive); parsed with ``pd.Timestamp``.
        snapshot_dir: Directory that receives the snapshot files; created if
            it does not exist.

    Returns:
        The manifest as a DataFrame, with ``attrs["snapshot_date"]`` set to
        ``SNAPSHOT_DATE``.
    """
    # Explicit column lists so that the CSV files always carry a header row,
    # even when no instrument succeeded (or the universe is empty). The order
    # matches the key order of the dicts built below.
    manifest_columns = [
        "filekey", "symbol", "name", "channel", "source", "asset_type",
        "frequency", "start", "end", "n_rows", "n_fields", "pull_ts_utc",
        "lib_version",
    ]
    pull_log_columns = ["filekey", "status", "detail"]

    keys = poc_fetch.load_keys()
    have = {k: bool(v) for k, v in keys.items()}
    print(f"PortfolioLens PoC pull  |  {start} .. {end}")
    print("API keys present: " + ", ".join(f"{k}={'yes' if v else 'NO'}" for k, v in have.items()))
    print("-" * 78)

    RAW_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_dir.mkdir(parents=True, exist_ok=True)

    # Value recorded in the manifest's ``lib_version`` column, per source.
    lib_by_source = {
        "yfinance": _lib_version("yfinance"),
        "fred": "fredgraph.csv",
        "alphavantage": "rest-v1", "eia": "rest-v2", "treasury": "rest-v1",
    }

    panel_cols: dict[str, pd.Series] = {}
    manifest_rows, pull_log = [], []

    for inst in poc_universe.UNIVERSE:
        tag = f"[{inst.source:>12}] {inst.symbol:<12}"
        # FRED is exempt from the key check even when flagged ``needs_key``.
        if inst.needs_key and inst.source != "fred":
            keyname = {"alphavantage": "ALPHAVANTAGE_API_KEY",
                       "eia": "EIA_API_KEY"}.get(inst.source)
            if keyname and not keys.get(keyname):
                print(f"{tag} SKIP — no {keyname}")
                pull_log.append({"filekey": inst.filekey, "status": "skipped",
                                 "detail": f"missing {keyname}"})
                continue
        try:
            df = poc_fetch.fetch_instrument(inst, start, end, keys)
            # clip to the pinned window
            df = df.loc[(df.index >= pd.Timestamp(start)) & (df.index <= pd.Timestamp(end))]
            if df.empty:
                raise ValueError("no rows in requested window")

            # Read the panel column BEFORE writing anything: a frame without a
            # ``value`` column must not leave parquet files behind for an
            # instrument that ends up logged as an error.
            values = df["value"]

            raw_source_dir = RAW_DIR / inst.source
            raw_source_dir.mkdir(parents=True, exist_ok=True)
            df.to_parquet(raw_source_dir / f"{inst.filekey}.parquet")
            df.to_parquet(snapshot_dir / f"{inst.filekey}.parquet")
            panel_cols[inst.filekey] = values

            manifest_rows.append({
                "filekey": inst.filekey, "symbol": inst.symbol, "name": inst.name,
                "channel": inst.channel, "source": inst.source,
                "asset_type": inst.asset_type, "frequency": inst.frequency,
                "start": df.index.min().date(), "end": df.index.max().date(),
                "n_rows": len(df), "n_fields": df.shape[1],
                "pull_ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "lib_version": lib_by_source.get(inst.source, "n/a"),
            })
            pull_log.append({"filekey": inst.filekey, "status": "ok", "detail": f"{len(df)} rows"})
            print(f"{tag} OK   {len(df):>6} rows  {df.index.min().date()} .. {df.index.max().date()}")
        except Exception as exc:  # noqa: BLE001 — log and continue, never abort the run
            pull_log.append({"filekey": inst.filekey, "status": "error", "detail": str(exc)[:160]})
            print(f"{tag} ERROR — {str(exc)[:120]}")

    # wide value panel (outer join on date)
    if panel_cols:
        panel = pd.DataFrame(panel_cols).sort_index()
        panel.index.name = "date"
        panel.to_parquet(snapshot_dir / "panel_values.parquet")

    manifest = pd.DataFrame(manifest_rows, columns=manifest_columns)
    # ``attrs`` travels only with the returned object; it is not part of the CSV.
    manifest.attrs["snapshot_date"] = SNAPSHOT_DATE
    manifest.to_csv(snapshot_dir / "manifest.csv", index=False)
    pd.DataFrame(pull_log, columns=pull_log_columns).to_csv(
        snapshot_dir / "pull_log.csv", index=False)

    ok = sum(1 for r in pull_log if r["status"] == "ok")
    print("-" * 78)
    print(f"Snapshot written to {snapshot_dir}  |  {ok}/{len(poc_universe.UNIVERSE)} instruments OK"
          f"  ({len(pull_log) - ok} skipped/error)")
    return manifest


def main() -> None:
    """Parse ``--start`` / ``--end`` from the command line and run the pull."""
    parser = argparse.ArgumentParser(description="PortfolioLens Phase-2 PoC data pull")
    # Both options are plain strings; defaults come from the module constants.
    for flag, fallback in (("--start", DEFAULT_START), ("--end", SNAPSHOT_DATE)):
        parser.add_argument(flag, default=fallback)
    opts = parser.parse_args()
    run(opts.start, opts.end)


if __name__ == "__main__":
    main()
