"""PortfolioLens — Phase 2 per-source fetchers (free data sources only).

Each fetcher returns a tidy ``pandas.DataFrame`` with:
  * a ``DatetimeIndex`` named ``date`` (tz-naive, ascending), and
  * at least a ``value`` column (the representative level used in exploration),
    plus OHLCV columns where the source provides them.

Design notes
------------
* yfinance and US Treasury fiscaldata need **no** API key.
* FRED works with a key (``fredapi``) and falls back to FRED's public, keyless
  ``fredgraph.csv`` endpoint when no key is present — so the PoC produces real
  macro data even before you register a FRED key.
* Alpha Vantage and EIA strictly require keys; without them the orchestrator
  skips those instruments and logs the reason (it never aborts the whole run).

These are thin, dependency-light wrappers — intentionally easy to read and to
reuse in later CRISP-DM phases.
"""

from __future__ import annotations

import os
import time
from typing import Optional

import pandas as pd
import requests


# ─────────────────────────────────────────────────────────────────────────────
# helpers
# ─────────────────────────────────────────────────────────────────────────────
def _tidy(df: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Return a normalized copy of ``df`` in the module's tidy layout.

    The input frame is not modified. On the copy, the index is parsed with
    ``pd.to_datetime``, stripped of any timezone, named ``date``, de-duplicated
    (the last row wins for a repeated timestamp) and sorted ascending.

    Args:
        df: Frame whose index holds the observation dates.
        value_col: Column to mirror into a numeric ``value`` column.

    Returns:
        The normalized frame. A ``value`` column is added only when
        ``value_col`` exists and no ``value`` column is present yet; entries
        that cannot be parsed as numbers become NaN.
    """
    out = df.copy()

    stamps = pd.to_datetime(out.index)
    try:
        stamps = stamps.tz_localize(None)  # make a tz-aware index tz-naive
    except (TypeError, AttributeError):
        pass  # index cannot be localized; keep it exactly as parsed
    out.index = stamps
    out.index.name = "date"

    # Drop earlier rows of any repeated timestamp, then order chronologically.
    is_repeat = out.index.duplicated(keep="last")
    out = out.loc[~is_repeat].sort_index()

    # An existing 'value' column is left untouched; a missing source column
    # means there is nothing to mirror.
    if "value" in out.columns or value_col not in out.columns:
        return out
    out["value"] = pd.to_numeric(out[value_col], errors="coerce")
    return out


# ─────────────────────────────────────────────────────────────────────────────
# yfinance — equities, ETFs, indices, crypto (no key)
# ─────────────────────────────────────────────────────────────────────────────
def fetch_yfinance(symbol: str, start: str, end: Optional[str] = None) -> pd.DataFrame:
    """Download price history for ``symbol`` through yfinance (no API key).

    Args:
        symbol: Ticker passed straight to ``yf.download``.
        start: Start date forwarded to ``yf.download``.
        end: Optional end date forwarded to ``yf.download``.

    Returns:
        Tidy frame holding whichever of ``Open``/``High``/``Low``/``Close``/
        ``Volume`` the download provided, plus ``value`` (numeric ``Close``).

    Raises:
        ValueError: If yfinance returns ``None`` or an empty frame.
    """
    import yfinance as yf

    prices = yf.download(symbol, start=start, end=end, progress=False,
                         actions=False, auto_adjust=True)
    if prices is None or prices.empty:
        raise ValueError(f"yfinance returned no data for {symbol!r}")

    # Recent yfinance versions hand back MultiIndex columns even for a single
    # ticker; keep only the first level (the field names).
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)

    # Title-case the headers so they read Open/High/Low/Close/Volume.
    prices = prices.rename(columns=str.title)
    tidy = _tidy(prices, value_col="Close")

    wanted = ("Open", "High", "Low", "Close", "Volume", "value")
    return tidy[[col for col in wanted if col in tidy.columns]]


# ─────────────────────────────────────────────────────────────────────────────
# FRED — macro / rates / commodity spots (key, or keyless fallback)
# ─────────────────────────────────────────────────────────────────────────────
def fetch_fred(series_id: str, start: str, end: Optional[str] = None,
               api_key: Optional[str] = None) -> pd.DataFrame:
    """Fetch one FRED series as a tidy frame with a single ``value`` column.

    With a truthy ``api_key`` the series is pulled through ``fredapi``.
    Otherwise it is read from FRED's public ``fredgraph.csv`` endpoint, which
    needs no key. The keyless request is attempted up to three times with a
    growing pause; throttling responses (HTTP 429/503) and transient network
    failures (connection errors, timeouts) are retried.

    Args:
        series_id: FRED series identifier.
        start: Earliest observation date. On the keyless path only the first
            10 characters (``YYYY-MM-DD``) are sent.
        end: Optional latest observation date, truncated the same way.
        api_key: FRED API key; falsy selects the keyless CSV path.

    Returns:
        Frame indexed by ``date`` with a numeric ``value`` column. On the
        keyless path FRED's ``.`` missing-value marker is read as NaN.

    Raises:
        requests.HTTPError: On an HTTP error status other than 429/503.
        requests.ConnectionError: If a connection error persists on the last
            attempt.
        requests.Timeout: If a timeout persists on the last attempt.
        RuntimeError: If every attempt was throttled (429/503), or the CSV has
            fewer than two columns.
    """
    if api_key:
        from fredapi import Fred
        s = Fred(api_key=api_key).get_series(
            series_id, observation_start=start, observation_end=end)
        return _tidy(s.to_frame(name="value"), value_col="value")

    # Keyless path — FRED's public fredgraph.csv endpoint. Preferred for the bulk
    # pull: it returns FULL history and is not subject to the API key's per-key
    # rate limit / windowing that truncated long series. (pandas-datareader is
    # avoided — it is incompatible with pandas 3.x.) The FRED API key is reserved
    # for ALFRED point-in-time/vintage pulls in Phase 3.
    import io
    url = "https://fred.stlouisfed.org/graph/fredgraph.csv"
    params = {"id": series_id}
    if start:
        params["cosd"] = start[:10]
    if end:
        params["coed"] = end[:10]

    max_attempts = 3
    resp = None
    last_exc = None
    for attempt in range(max_attempts):
        time.sleep(0.5 * (attempt + 1))  # polite throttle + backoff (avoid 429s)
        try:
            resp = requests.get(url, params=params, timeout=30)
            resp.raise_for_status()
            break
        except requests.HTTPError as exc:
            last_exc = exc
            # Only throttling / temporary-unavailable statuses are worth retrying.
            if resp is not None and resp.status_code in (429, 503):
                continue
            raise
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Transient network failure: retry with the same backoff, but let the
            # original exception type surface once the attempts are used up.
            last_exc = exc
            if attempt + 1 < max_attempts:
                continue
            raise
    else:
        # Reached only when every attempt ended in a retryable HTTP status.
        raise RuntimeError(
            f"FRED CSV {series_id} failed after retries: {last_exc}") from last_exc

    raw = pd.read_csv(io.StringIO(resp.text), na_values=["."])
    if raw.shape[1] < 2:
        raise RuntimeError(f"FRED CSV for {series_id!r} had unexpected shape {raw.shape}")
    # Column headers vary by series, so address the two columns by position.
    raw = raw.rename(columns={raw.columns[0]: "date", raw.columns[1]: "value"})
    raw["value"] = pd.to_numeric(raw["value"], errors="coerce")
    return _tidy(raw.set_index("date")[["value"]], value_col="value")


# ─────────────────────────────────────────────────────────────────────────────
# Alpha Vantage — FX (key required; sparing use)
# ─────────────────────────────────────────────────────────────────────────────
def fetch_alphavantage_fx(from_symbol: str, to_symbol: str, api_key: str,
                          outputsize: str = "full") -> pd.DataFrame:
    """Fetch daily FX OHLC bars from Alpha Vantage's ``FX_DAILY`` function.

    Args:
        from_symbol: Base currency code sent as ``from_symbol``.
        to_symbol: Quote currency code sent as ``to_symbol``.
        api_key: Alpha Vantage API key (required).
        outputsize: Forwarded as the ``outputsize`` query parameter.

    Returns:
        Tidy frame with numeric ``Open``/``High``/``Low``/``Close`` columns and
        ``value`` (a copy of ``Close``).

    Raises:
        ValueError: If ``api_key`` is empty.
        requests.HTTPError: On an HTTP error status.
        RuntimeError: If the payload carries no daily FX time series.
    """
    if not api_key:
        raise ValueError("Alpha Vantage requires ALPHAVANTAGE_API_KEY")

    query = {
        "function": "FX_DAILY",
        "from_symbol": from_symbol,
        "to_symbol": to_symbol,
        "outputsize": outputsize,
        "apikey": api_key,
    }
    response = requests.get("https://www.alphavantage.co/query",
                            params=query, timeout=30)
    response.raise_for_status()
    body = response.json()

    series_key = "Time Series FX (Daily)"
    if series_key not in body:
        # AV returns {"Note": ...} on throttle, {"Information": ...} on bad key.
        # Use the first non-empty diagnostic field, else a clipped payload dump.
        for field in ("Note", "Information", "Error Message"):
            reason = body.get(field)
            if reason:
                break
        else:
            reason = str(body)[:200]
        raise RuntimeError(f"Alpha Vantage FX_DAILY failed: {reason}")

    ohlc = ["Open", "High", "Low", "Close"]
    av_names = {"1. open": "Open", "2. high": "High",
                "3. low": "Low", "4. close": "Close"}
    # The payload maps date -> {field: text}; transpose so dates form the index.
    bars = pd.DataFrame(body[series_key]).T.rename(columns=av_names)
    for col in ohlc:
        bars[col] = pd.to_numeric(bars[col], errors="coerce")

    tidy = _tidy(bars, value_col="Close")
    return tidy[ohlc + ["value"]]


# ─────────────────────────────────────────────────────────────────────────────
# EIA v2 — petroleum series (key required)
# ─────────────────────────────────────────────────────────────────────────────
def fetch_eia_series(series_id: str, api_key: str,
                     start: Optional[str] = None) -> pd.DataFrame:
    """Fetch one series from the EIA v2 ``seriesid`` endpoint.

    Args:
        series_id: EIA series identifier, appended to the endpoint path.
        api_key: EIA API key (required).
        start: Optional start date; its first 10 characters are sent as the
            ``start`` query parameter.

    Returns:
        Tidy frame indexed by ``date`` (the response's ``period`` field) with a
        numeric ``value`` column.

    Raises:
        ValueError: If ``api_key`` is empty.
        requests.HTTPError: On an HTTP error status.
        RuntimeError: If the response carries no data rows.
    """
    if not api_key:
        raise ValueError("EIA requires EIA_API_KEY")

    query = {"api_key": api_key}
    if start:
        query["start"] = start[:10]

    response = requests.get(f"https://api.eia.gov/v2/seriesid/{series_id}",
                            params=query, timeout=30)
    response.raise_for_status()

    # Observations sit under response -> data in the JSON body.
    records = response.json().get("response", {}).get("data", [])
    if not records:
        raise RuntimeError(f"EIA returned no data for {series_id!r}")

    frame = pd.DataFrame(records).rename(columns={"period": "date"})
    frame["value"] = pd.to_numeric(frame["value"], errors="coerce")
    return _tidy(frame.set_index("date")[["value"]], value_col="value")


# ─────────────────────────────────────────────────────────────────────────────
# US Treasury fiscaldata — Reporting Rates of Exchange (keyless)
# ─────────────────────────────────────────────────────────────────────────────
def fetch_treasury_exchange_rate(currency_contains: str = "Euro",
                                 start: Optional[str] = None) -> pd.DataFrame:
    """Fetch a Treasury "Reporting Rates of Exchange" series (keyless).

    Args:
        currency_contains: Case-insensitive substring selecting the currency by
            its ``country_currency_desc`` label. The default ``"Euro"`` (also
            used for an empty value) selects exactly ``Euro Zone-Euro``.
        start: Optional earliest ``record_date``; its first 10 characters are
            sent as a ``record_date:gte`` filter.

    Returns:
        Tidy frame indexed by ``date`` (``record_date``) with a numeric
        ``value`` column (``exchange_rate``).

    Raises:
        requests.HTTPError: On an HTTP error status.
        RuntimeError: If the API returns no rows, nothing matches
            ``currency_contains``, or the result exceeds the page cap.
        ValueError: If ``currency_contains`` matches more than one distinct
            currency label (the rows would mix different currencies).
    """
    base = ("https://api.fiscaldata.treasury.gov/services/api/fiscal_service"
            "/v1/accounting/od/rates_of_exchange")
    page_size = 10000
    max_pages = 100  # safety cap so a misbehaving endpoint cannot loop forever

    needle = (currency_contains or "").strip().casefold() or "euro"

    # The filter syntax has no substring operator. The long-standing default
    # keeps its exact server-side filter (same result as before, small payload);
    # any other value is matched client-side against the returned labels.
    exact_default = needle == "euro"
    filters = []
    if exact_default:
        filters.append("country_currency_desc:in:(Euro Zone-Euro)")
    if start:
        filters.append(f"record_date:gte:{start[:10]}")

    params = {
        "fields": "record_date,country_currency_desc,exchange_rate",
        "page[size]": str(page_size),
        "sort": "record_date",
    }
    if filters:
        params["filter"] = ",".join(filters)

    rows = []
    for page in range(1, max_pages + 1):
        params["page[number]"] = str(page)
        resp = requests.get(base, params=params, timeout=30)
        resp.raise_for_status()
        batch = resp.json().get("data") or []
        rows.extend(batch)
        if len(batch) < page_size:
            break  # a short (or empty) page means there is nothing further
    else:
        raise RuntimeError(
            f"Treasury fiscaldata returned more than {max_pages} pages")
    if not rows:
        raise RuntimeError("Treasury fiscaldata returned no exchange-rate rows")

    if not exact_default:
        rows = [r for r in rows
                if needle in str(r.get("country_currency_desc", "")).casefold()]
        if not rows:
            raise RuntimeError(
                f"Treasury fiscaldata has no currency matching {currency_contains!r}")
        labels = sorted({str(r.get("country_currency_desc", "")) for r in rows})
        if len(labels) > 1:
            # Blending several currencies into one 'value' series would be
            # silently wrong, so ask the caller to be more specific.
            raise ValueError(
                f"currency_contains={currency_contains!r} is ambiguous; "
                f"matches {labels}")

    df = pd.DataFrame(rows)
    df["value"] = pd.to_numeric(df["exchange_rate"], errors="coerce")
    df = df.rename(columns={"record_date": "date"}).set_index("date")
    return _tidy(df[["value"]], value_col="value")


# ─────────────────────────────────────────────────────────────────────────────
# dispatch
# ─────────────────────────────────────────────────────────────────────────────
def fetch_instrument(inst, start: str, end: Optional[str], keys: dict,
                     av_sleep: float = 12.0) -> pd.DataFrame:
    """Dispatch ``inst`` to the fetcher named by ``inst.source``.

    Args:
        inst: Object exposing ``source`` and ``symbol`` attributes.
        start: Start date handed to fetchers that accept one.
        end: Optional end date (used by the yfinance and FRED fetchers).
        keys: Mapping of API-key names to values; missing keys count as "".
        av_sleep: Seconds to pause after a successful Alpha Vantage call.

    Returns:
        The frame produced by the selected fetcher.

    Raises:
        ValueError: If ``inst.source`` is not a known source. Fetcher errors
            propagate unchanged (the caller is expected to log them).
    """
    source = inst.source

    if source == "yfinance":
        frame = fetch_yfinance(inst.symbol, start, end)
    elif source == "fred":
        # Force the keyless CSV path: full history, no per-key rate limit/truncation.
        frame = fetch_fred(inst.symbol, start, end, api_key=None)
    elif source == "alphavantage":
        # Symbols are written as "FROM/TO".
        base_ccy, quote_ccy = inst.symbol.split("/")
        av_key = keys.get("ALPHAVANTAGE_API_KEY", "")
        frame = fetch_alphavantage_fx(base_ccy, quote_ccy, api_key=av_key)
        time.sleep(av_sleep)  # respect 5 requests/minute
    elif source == "eia":
        eia_key = keys.get("EIA_API_KEY", "")
        frame = fetch_eia_series(inst.symbol, api_key=eia_key, start=start)
    elif source == "treasury":
        frame = fetch_treasury_exchange_rate(start=start)
    else:
        raise ValueError(f"Unknown source: {source!r}")

    return frame


def load_keys() -> dict:
    """Collect the API keys this module uses from the environment.

    If python-dotenv is importable, the nearest ``.env`` (searched from the
    current working directory) is loaded first. Any failure in that step is
    ignored, so the function still works without dotenv.

    Returns:
        Dict with ``ALPHAVANTAGE_API_KEY``, ``FRED_API_KEY`` and
        ``EIA_API_KEY``; a key that is not set maps to ``""``.
    """
    names = ("ALPHAVANTAGE_API_KEY", "FRED_API_KEY", "EIA_API_KEY")

    try:
        import dotenv
        dotenv.load_dotenv(dotenv.find_dotenv(usecwd=True))
    except Exception:
        pass  # best effort: dotenv is optional

    found = {}
    for name in names:
        found[name] = os.getenv(name, "")
    return found
