Source code for src.terrain.precipitation_downloader

"""
Precipitation data downloader module.

Downloads and manages precipitation data from various sources (PRISM, WorldClim, etc.)
for hydrological analysis and terrain visualization.

Supports:
- PRISM (Parameter-elevation Regressions on Independent Slopes Model)
- WorldClim (global climate data)
- CHELSA (high-resolution climate data)
"""

from pathlib import Path
from typing import Dict, List, Tuple, Union, Optional
import numpy as np
import rasterio
from rasterio import Affine
from rasterio.transform import from_bounds
import requests
import zipfile
import io


# Dataset metadata
DATASETS = {
    "prism": {
        "name": "PRISM Climate Group",
        "resolution": "4km (~30 arc-seconds)",
        "temporal_coverage": "1981-2010 normals",
        "url": "https://prism.oregonstate.edu",
        "description": "High-quality spatial climate data for the US",
        "coverage": "Continental USA only",
    },
    "worldclim": {
        "name": "WorldClim",
        "resolution": "4.5km (2.5 arc-minutes)",
        "temporal_coverage": "1970-2000",
        "url": "https://www.worldclim.org",
        "description": "Global climate data (covers USA, Mexico, and worldwide)",
        "coverage": "Global",
    },
    "worldclim_30s": {
        "name": "WorldClim 30-second",
        "resolution": "1km (30 arc-seconds)",
        "temporal_coverage": "1970-2000",
        "url": "https://www.worldclim.org",
        "description": "High-resolution global climate data (~1km). Download is ~1GB.",
        "coverage": "Global",
    },
    "chelsa": {
        "name": "CHELSA",
        "resolution": "1km (30 arc-seconds)",
        "temporal_coverage": "1979-2013",
        "url": "https://chelsa-climate.org",
        "description": "High-resolution climate data for mountainous regions",
    },
}



[docs]
def download_precipitation(
    bbox: Tuple[float, float, float, float],
    output_dir: str,
    dataset: str = "prism",
    force_download: bool = False,
    use_real_data: bool = True,
) -> Path:
    """
    Download precipitation data for bounding box.

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon) in WGS84
    output_dir : str
        Directory to save downloaded data
    dataset : str, default 'prism'
        Dataset to download ('prism', 'worldclim', 'chelsa')
    force_download : bool, default False
        Force re-download even if cached
    use_real_data : bool, default True
        If True, download real data from servers. If False, generate synthetic data for testing.

    Returns
    -------
    Path
        Path to downloaded precipitation GeoTIFF

    Raises
    ------
    ValueError
        If bbox is invalid or dataset not supported
    """
    # Validate bbox
    if not isinstance(bbox, (tuple, list)) or len(bbox) != 4:
        raise ValueError("bbox must be a tuple/list with 4 values (min_lat, min_lon, max_lat, max_lon)")

    min_lat, min_lon, max_lat, max_lon = bbox

    # Validate bbox order
    if min_lat >= max_lat or min_lon >= max_lon:
        raise ValueError(
            f"Invalid bbox: coordinates must be (min_lat, min_lon, max_lat, max_lon). "
            f"Got ({min_lat}, {min_lon}, {max_lat}, {max_lon})"
        )

    # Validate dataset
    if dataset not in DATASETS:
        raise ValueError(f"Unknown dataset: {dataset}. Available: {list(DATASETS.keys())}")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate output filename
    bbox_str = f"{min_lat}_{min_lon}_{max_lat}_{max_lon}".replace(".", "p").replace("-", "m")
    output_file = output_path / f"{dataset}_annual_precip_{bbox_str}.tif"

    # Check cache
    if output_file.exists() and not force_download:
        print(f"Using cached precipitation data: {output_file}")
        return output_file

    # Download data (dataset-specific)
    if dataset == "prism":
        data, transform = get_prism_annual_precip(bbox, output_dir=str(output_path), use_real_data=use_real_data)
    elif dataset == "worldclim":
        if use_real_data:
            data, transform = download_real_worldclim_annual(bbox, output_dir=str(output_path))
        else:
            data, transform = _create_synthetic_precipitation(bbox)
    elif dataset == "worldclim_30s":
        if use_real_data:
            data, transform = download_real_worldclim_30s_annual(bbox, output_dir=str(output_path))
        else:
            data, transform = _create_synthetic_precipitation(bbox, resolution=1/120)  # 30 arc-seconds
    elif dataset == "chelsa":
        # Stub for CHELSA (would implement actual download)
        data, transform = _create_synthetic_precipitation(bbox)
    else:
        raise ValueError(f"Dataset {dataset} not yet implemented")

    # Save as GeoTIFF
    _write_precipitation_geotiff(output_file, data, transform, crs="EPSG:4326")

    print(f"✓ Downloaded precipitation data to {output_file}")
    return output_file




[docs]
def download_real_worldclim_annual(
    bbox: Tuple[float, float, float, float], output_dir: str
) -> Tuple[np.ndarray, Affine]:
    """
    Download real WorldClim annual precipitation data.

    WorldClim provides global climate data at ~4.5km resolution (2.5 arc-minutes).
    This function downloads BIO12 (annual precipitation) for the specified bbox.

    Uses 2.5m resolution instead of 30s for reasonable download size (~10MB vs 9.7GB).

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)
    output_dir : str
        Output directory

    Returns
    -------
    np.ndarray
        Precipitation data (mm/year)
    Affine
        Geographic transform

    Raises
    ------
    Exception
        If download fails or network is unavailable
    """
    min_lat, min_lon, max_lat, max_lon = bbox

    # WorldClim 2.1 BIO12 (annual precipitation) at 2.5 minutes (~4.5km)
    # Using 2.5m resolution (comparable to PRISM's 4km) - much smaller than 30s!
    # File size: ~5-10MB vs 9.7GB for 30s resolution
    base_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_bio.zip"

    print(f"Downloading WorldClim annual precipitation from {base_url}...")
    print(f"  (2.5 minute resolution, ~10MB, should take <30 seconds)")

    try:
        # Download the ZIP file
        response = requests.get(base_url, timeout=300)  # 5 min timeout for large file
        response.raise_for_status()

        # Extract ZIP contents
        with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
            # Find BIO12 (annual precipitation) TIF file
            bio12_file = None
            for name in zf.namelist():
                if 'bio_12' in name.lower() and name.endswith('.tif'):
                    bio12_file = name
                    break

            if not bio12_file:
                raise ValueError(f"Could not find BIO12 (annual precip) in ZIP. Files: {zf.namelist()}")

            # Extract to temp directory
            output_path = Path(output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            tif_path = output_path / bio12_file

            with open(tif_path, 'wb') as f:
                f.write(zf.read(bio12_file))

            print(f"✓ Downloaded WorldClim BIO12 to {tif_path}")

            # Read the TIF file using rasterio
            with rasterio.open(tif_path) as src:
                # Read full raster
                full_data = src.read(1).astype(np.float32)
                full_transform = src.transform
                full_crs = src.crs

                # Calculate pixel coordinates for bbox
                from rasterio.transform import rowcol

                # Get row/col for bbox corners
                min_row, min_col = rowcol(full_transform, min_lon, max_lat)  # Upper-left
                max_row, max_col = rowcol(full_transform, max_lon, min_lat)  # Lower-right

                # Clamp to raster bounds
                min_row = max(0, min_row)
                min_col = max(0, min_col)
                max_row = min(full_data.shape[0], max_row)
                max_col = min(full_data.shape[1], max_col)

                # Extract subset
                subset_data = full_data[min_row:max_row, min_col:max_col].copy()

                # Create transform for subset
                subset_transform = full_transform * Affine.translation(min_col, min_row)

                print(f"✓ Extracted subset: {subset_data.shape} pixels")
                print(f"  Precipitation range: {subset_data.min():.1f} to {subset_data.max():.1f} mm/year")

                return subset_data, subset_transform

    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error downloading WorldClim data: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to download WorldClim data: {e}") from e




[docs]
def download_real_worldclim_30s_annual(
    bbox: Tuple[float, float, float, float], output_dir: str
) -> Tuple[np.ndarray, Affine]:
    """
    Download real WorldClim 30-second (~1km) annual precipitation data.

    Downloads monthly precipitation data and sums to annual total.
    File size: ~1GB download for global coverage.

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)
    output_dir : str
        Output directory

    Returns
    -------
    np.ndarray
        Precipitation data (mm/year)
    Affine
        Geographic transform

    Raises
    ------
    Exception
        If download fails or network is unavailable
    """
    min_lat, min_lon, max_lat, max_lon = bbox

    # WorldClim 2.1 monthly precipitation at 30 seconds (~1km)
    # Contains 12 TIF files (one per month), we sum them for annual total
    base_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_30s_prec.zip"

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Check if we already have the raw data cached
    cache_dir = output_path / "worldclim_30s_raw"
    annual_cache = cache_dir / "wc2.1_30s_prec_annual.tif"

    if annual_cache.exists():
        print(f"Using cached WorldClim 30s annual precipitation: {annual_cache}")
        with rasterio.open(annual_cache) as src:
            return _extract_worldclim_subset(src, bbox)

    print(f"Downloading WorldClim 30-second precipitation from {base_url}...")
    print(f"  (~1km resolution, ~1GB download, may take several minutes)")

    try:
        # Download the ZIP file with progress indication
        response = requests.get(base_url, timeout=1800, stream=True)  # 30 min timeout
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0
        chunks = []

        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
            chunks.append(chunk)
            downloaded += len(chunk)
            if total_size > 0:
                pct = (downloaded / total_size) * 100
                print(f"\r  Downloaded: {downloaded / 1e6:.1f} MB / {total_size / 1e6:.1f} MB ({pct:.1f}%)", end="", flush=True)
            else:
                print(f"\r  Downloaded: {downloaded / 1e6:.1f} MB", end="", flush=True)

        print()  # newline after progress
        content = b''.join(chunks)

        # Extract ZIP contents
        cache_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(io.BytesIO(content)) as zf:
            # Find all monthly precipitation TIF files
            monthly_files = sorted([
                name for name in zf.namelist()
                if name.endswith('.tif') and 'prec' in name.lower()
            ])

            if len(monthly_files) != 12:
                raise ValueError(f"Expected 12 monthly files, found {len(monthly_files)}: {monthly_files}")

            print(f"✓ Extracting {len(monthly_files)} monthly precipitation files...")

            # Extract all monthly files
            extracted_paths = []
            for name in monthly_files:
                tif_path = cache_dir / Path(name).name
                with open(tif_path, 'wb') as f:
                    f.write(zf.read(name))
                extracted_paths.append(tif_path)

            # Sum monthly to annual
            print("  Summing monthly precipitation to annual total...")
            annual_data = None
            transform = None
            crs = None

            for i, tif_path in enumerate(extracted_paths):
                with rasterio.open(tif_path) as src:
                    monthly_data = src.read(1).astype(np.float32)
                    # Handle nodata (typically -32768 or similar)
                    nodata = src.nodata
                    if nodata is not None:
                        monthly_data = np.where(monthly_data == nodata, 0, monthly_data)

                    if annual_data is None:
                        annual_data = monthly_data.copy()
                        transform = src.transform
                        crs = src.crs
                    else:
                        annual_data += monthly_data

                print(f"\r  Processed month {i + 1}/12", end="", flush=True)

            print()  # newline

            # Save annual total to cache
            print(f"  Caching annual total to {annual_cache}...")
            with rasterio.open(
                annual_cache,
                "w",
                driver="GTiff",
                height=annual_data.shape[0],
                width=annual_data.shape[1],
                count=1,
                dtype=annual_data.dtype,
                crs=crs,
                transform=transform,
                compress="lzw",
            ) as dst:
                dst.write(annual_data, 1)

            # Clean up monthly files to save space
            for tif_path in extracted_paths:
                tif_path.unlink()

            print(f"✓ Created annual precipitation raster: {annual_cache}")
            print(f"  Shape: {annual_data.shape}, Range: {annual_data.min():.1f} - {annual_data.max():.1f} mm/year")

            # Extract subset for bbox
            with rasterio.open(annual_cache) as src:
                return _extract_worldclim_subset(src, bbox)

    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error downloading WorldClim 30s data: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to download WorldClim 30s data: {e}") from e



def _extract_worldclim_subset(
    src: rasterio.DatasetReader, bbox: Tuple[float, float, float, float]
) -> Tuple[np.ndarray, Affine]:
    """
    Extract a subset from a WorldClim raster for the given bbox.

    Parameters
    ----------
    src : rasterio.DatasetReader
        Open rasterio dataset
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)

    Returns
    -------
    np.ndarray
        Subset precipitation data
    Affine
        Geographic transform for subset
    """
    from rasterio.transform import rowcol

    min_lat, min_lon, max_lat, max_lon = bbox

    full_data = src.read(1).astype(np.float32)
    full_transform = src.transform

    # Get row/col for bbox corners
    min_row, min_col = rowcol(full_transform, min_lon, max_lat)  # Upper-left
    max_row, max_col = rowcol(full_transform, max_lon, min_lat)  # Lower-right

    # Clamp to raster bounds
    min_row = max(0, min_row)
    min_col = max(0, min_col)
    max_row = min(full_data.shape[0], max_row)
    max_col = min(full_data.shape[1], max_col)

    # Extract subset
    subset_data = full_data[min_row:max_row, min_col:max_col].copy()

    # Create transform for subset
    subset_transform = full_transform * Affine.translation(min_col, min_row)

    print(f"✓ Extracted subset: {subset_data.shape} pixels for bbox")
    print(f"  Precipitation range: {subset_data.min():.1f} to {subset_data.max():.1f} mm/year")

    return subset_data, subset_transform



[docs]
def download_real_prism_annual(
    bbox: Tuple[float, float, float, float], output_dir: str
) -> Tuple[np.ndarray, Affine]:
    """
    Download real PRISM annual precipitation data from FTP server.

    IMPORTANT: PRISM only covers continental USA. For areas outside USA
    (e.g., Mexico, Canada), use WorldClim instead via download_real_worldclim_annual().

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)
    output_dir : str
        Output directory

    Returns
    -------
    np.ndarray
        Precipitation data (mm/year)
    Affine
        Geographic transform

    Raises
    ------
    Exception
        If download fails or network is unavailable
    """
    min_lat, min_lon, max_lat, max_lon = bbox

    # PRISM annual normals 1991-2020 (4km resolution)
    # New URL structure as of 2025: https://ftp.prism.oregonstate.edu/normals/us/4km/ppt/monthly/
    base_url = "https://ftp.prism.oregonstate.edu/normals/us/4km/ppt/monthly/prism_ppt_us_25m_2020_avg_30y.zip"

    print(f"Downloading PRISM annual precipitation from {base_url}...")

    try:
        # Download the ZIP file
        response = requests.get(base_url, timeout=60)
        response.raise_for_status()

        # Extract ZIP contents
        with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
            # Find the TIF file (PRISM now uses GeoTIFF format)
            tif_file = None
            for name in zf.namelist():
                if name.endswith('.tif') and not name.endswith('.aux.xml'):
                    tif_file = name
                    break

            if not tif_file:
                raise ValueError(f"Could not find TIF file in ZIP. Files: {zf.namelist()}")

            # Extract TIF file to temp directory
            output_path = Path(output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            tif_path = output_path / tif_file

            with open(tif_path, 'wb') as f:
                f.write(zf.read(tif_file))

            print(f"✓ Downloaded PRISM data to {tif_path}")

            # Read the TIF file using rasterio
            with rasterio.open(tif_path) as src:
                # Read full raster
                full_data = src.read(1).astype(np.float32)
                full_transform = src.transform
                full_crs = src.crs

                # Calculate pixel coordinates for bbox
                # Transform from geographic to pixel coordinates
                from rasterio.transform import rowcol

                # Get row/col for bbox corners
                min_row, min_col = rowcol(full_transform, min_lon, max_lat)  # Upper-left
                max_row, max_col = rowcol(full_transform, max_lon, min_lat)  # Lower-right

                # Clamp to raster bounds
                min_row = max(0, min_row)
                min_col = max(0, min_col)
                max_row = min(full_data.shape[0], max_row)
                max_col = min(full_data.shape[1], max_col)

                # Extract subset
                subset_data = full_data[min_row:max_row, min_col:max_col].copy()

                # Create transform for subset
                # Transform maps from pixel coordinates to geographic coordinates
                # For a subset starting at (min_row, min_col), we need to adjust the transform
                subset_transform = full_transform * Affine.translation(min_col, min_row)

                return subset_data, subset_transform

    except requests.exceptions.RequestException as e:
        raise Exception(f"Network error downloading PRISM data: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to download PRISM data: {e}") from e




[docs]
def get_prism_annual_precip(
    bbox: Tuple[float, float, float, float], output_dir: str, use_real_data: bool = False
) -> Tuple[np.ndarray, Affine]:
    """
    Download PRISM annual precipitation for bounding box.

    By default, creates synthetic orographic precipitation model for testing.
    Set use_real_data=True to download actual PRISM data from Oregon State servers.

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)
    output_dir : str
        Output directory
    use_real_data : bool, default False
        If True, download real PRISM data. If False, generate synthetic data.

    Returns
    -------
    np.ndarray
        Precipitation data (mm/year)
    Affine
        Geographic transform
    """
    min_lat, min_lon, max_lat, max_lon = bbox

    # Use real PRISM data if requested
    if use_real_data:
        return download_real_prism_annual(bbox, output_dir)

    # Create synthetic precipitation data
    # Resolution: ~4km (PRISM resolution) = ~0.04 degrees
    resolution = 0.04

    # Calculate grid dimensions
    width = int((max_lon - min_lon) / resolution) + 1
    height = int((max_lat - min_lat) / resolution) + 1

    # Create orographic precipitation model
    # Base: 300mm/year at sea level
    # Gradient: wetter in north, drier in south (typical for US West Coast)
    base_precip = 300.0
    lat_gradient = np.linspace(1.5, 0.8, height).reshape(-1, 1)  # Wetter in north
    lon_gradient = np.linspace(0.9, 1.1, width).reshape(1, -1)  # Slight west-east gradient

    # Create precipitation grid
    precip = base_precip * lat_gradient * lon_gradient

    # Add some variability (simulate topographic effects)
    noise = np.random.RandomState(42).normal(1.0, 0.15, size=(height, width))
    precip = precip * noise

    # Clip to reasonable range
    precip = np.clip(precip, 100, 2000)  # 100-2000 mm/year

    # Create geographic transform
    transform = from_bounds(min_lon, min_lat, max_lon, max_lat, width, height)

    return precip.astype(np.float32), transform




[docs]
def validate_precipitation_alignment(
    dem: Optional[np.ndarray],
    precip: Optional[np.ndarray],
    dem_shape: Optional[Tuple[int, int]] = None,
    dem_crs: Optional[str] = None,
    precip_crs: Optional[str] = None,
) -> None:
    """
    Validate that precipitation data aligns with DEM.

    Parameters
    ----------
    dem : np.ndarray, optional
        DEM array
    precip : np.ndarray, optional
        Precipitation array
    dem_shape : tuple, optional
        DEM shape if arrays not provided
    dem_crs : str, optional
        DEM CRS
    precip_crs : str, optional
        Precipitation CRS

    Raises
    ------
    ValueError
        If shapes or CRS don't match
    """
    # Validate shapes
    if dem is not None and precip is not None:
        if dem.shape != precip.shape:
            raise ValueError(
                f"Precipitation shape mismatch: DEM {dem.shape} vs Precip {precip.shape}"
            )
    elif dem_shape is not None and precip is not None:
        if precip.shape != dem_shape:
            raise ValueError(
                f"Precipitation shape mismatch: DEM {dem_shape} vs Precip {precip.shape}"
            )

    # Validate CRS
    if dem_crs is not None and precip_crs is not None:
        if dem_crs != precip_crs:
            raise ValueError(
                f"CRS mismatch: DEM {dem_crs} vs Precipitation {precip_crs}. "
                "Reproject one to match the other."
            )




[docs]
def list_available_datasets(include_metadata: bool = False) -> Union[List[str], Dict]:
    """
    List available precipitation datasets.

    Parameters
    ----------
    include_metadata : bool, default False
        Include metadata for each dataset

    Returns
    -------
    list or dict
        List of dataset names, or dict with metadata if include_metadata=True
    """
    if include_metadata:
        return DATASETS.copy()
    else:
        return list(DATASETS.keys())



def _create_synthetic_precipitation(
    bbox: Tuple[float, float, float, float], resolution: float = 0.04
) -> Tuple[np.ndarray, Affine]:
    """
    Create synthetic precipitation data for testing.

    Parameters
    ----------
    bbox : tuple
        Bounding box (min_lat, min_lon, max_lat, max_lon)
    resolution : float
        Resolution in degrees (default 0.04 = ~4km)

    Returns
    -------
    np.ndarray
        Precipitation data (mm/year)
    Affine
        Geographic transform
    """
    min_lat, min_lon, max_lat, max_lon = bbox

    width = int((max_lon - min_lon) / resolution) + 1
    height = int((max_lat - min_lat) / resolution) + 1

    # Simple gradient model
    precip = np.linspace(400, 800, height * width).reshape(height, width).astype(np.float32)

    transform = from_bounds(min_lon, min_lat, max_lon, max_lat, width, height)

    return precip, transform


def _write_precipitation_geotiff(
    path: Path, data: np.ndarray, transform: Affine, crs: str = "EPSG:4326"
) -> None:
    """
    Write precipitation data to GeoTIFF.

    Parameters
    ----------
    path : Path
        Output file path
    data : np.ndarray
        Precipitation data
    transform : Affine
        Geographic transform
    crs : str
        Coordinate reference system
    """
    height, width = data.shape

    with rasterio.open(
        path,
        "w",
        driver="GTiff",
        height=height,
        width=width,
        count=1,
        dtype=data.dtype,
        crs=crs,
        transform=transform,
        compress="lzw",
    ) as dst:
        dst.write(data, 1)
        # Add metadata
        dst.update_tags(
            1,
            units="mm/year",
            description="Annual precipitation",
        )