Source code for swxsoc.io.fillval

"""
Helpers for converting between in-memory NaN/mask representations and
on-disk CDF FILLVAL sentinels.

This module is intentionally CDF-agnostic; it operates on plain numpy arrays
and scalar FILLVAL values so it can be unit-tested in isolation from
:mod:`spacepy.pycdf`.

Conventions
-----------
- Floats round-trip both ``np.nan`` and a parallel boolean ``mask``.
- Integers preserve dtype; the FILLVAL sentinel stays in ``.data`` and the
  mask marks fill positions.
- Strings (``S`` / ``U`` dtypes) use the ISTP single-space sentinel
  (``b" "`` / ``" "``).  As a write-time convenience, the literal ``b"nan"``
  (what numpy stores when ``np.nan`` is coerced into a string array) and the
  alternate spelling ``b"NaN"`` are also treated as fill.  The reader is
  strict: only the spec sentinel maps to a mask bit.
- The ISTP FILLVAL values for specific CDF types (including the Epoch
  variants) are owned by :func:`swxsoc.io.fillval.get_fillval`.
"""

from typing import Any

import numpy as np

from swxsoc.util import const

# Names exported by a star-import of this module.
# NOTE(review): ``get_fillval`` is defined in this module but is not listed
# here -- confirm whether that omission is intentional.
__all__ = [
    "is_float_dtype",
    "is_string_dtype",
    "compute_fill_mask",
    "apply_fill_on_write",
    "apply_fillval_to_nan",
]

# ----------------------------------------------------------------------
# ISTP FILLVAL Sentinel Values
# ----------------------------------------------------------------------


def get_fillval(cdf_type: int):
    """
    Look up the ISTP FILLVAL sentinel associated with a CDF type code.

    Parameters
    ----------
    cdf_type : int
        Numeric CDF type code whose FILLVAL is wanted.

    Returns
    -------
    scalar
        The FILLVAL sentinel registered for ``cdf_type``.  The entry for
        ``CDF_EPOCH16`` is a 2-tuple rather than a single number.

    Raises
    ------
    KeyError
        If ``cdf_type`` is not present in the lookup table built below.
    """
    table = {}

    # Integer types: the signed sentinel is the most negative representable
    # value, the unsigned sentinel is the largest representable value.  No
    # unsigned entry is written for the 8-byte case.
    for nbytes in (1, 2, 4, 8):
        nbits = 8 * nbytes
        table[getattr(const, "CDF_INT{}".format(nbytes)).value] = -(2 ** (nbits - 1))
        if nbytes != 8:
            table[getattr(const, "CDF_UINT{}".format(nbytes)).value] = 2**nbits - 1

    # Floating-point, epoch16 and character types.
    table[const.CDF_EPOCH16.value] = (-1e31, -1e31)
    for real_type in (const.CDF_REAL8, const.CDF_REAL4):
        table[real_type.value] = -1e31
    for char_type in (const.CDF_CHAR, const.CDF_UCHAR):
        table[char_type.value] = " "

    # Types that reuse the sentinel of another, already registered type.
    aliases = (
        (const.CDF_TIME_TT2000, const.CDF_INT8),
        (const.CDF_EPOCH, const.CDF_REAL8),
        (const.CDF_BYTE, const.CDF_INT1),
        (const.CDF_FLOAT, const.CDF_REAL4),
        (const.CDF_DOUBLE, const.CDF_REAL8),
    )
    for alias, base in aliases:
        table[alias.value] = table[base.value]

    return table[cdf_type]


# ----------------------------------------------------------------------
# Dtype checks
# ----------------------------------------------------------------------



[docs]
def is_float_dtype(arr: np.ndarray) -> bool:
    """
    Tell whether ``arr`` holds real or complex floating-point data.

    ``arr`` is passed through :func:`numpy.asarray`, so plain sequences are
    accepted as well as arrays.
    """
    dtype = np.asarray(arr).dtype
    # ``np.inexact`` is the common parent of ``np.floating`` and
    # ``np.complexfloating`` in numpy's scalar type hierarchy.
    return np.issubdtype(dtype, np.inexact)




[docs]
def is_string_dtype(arr: np.ndarray) -> bool:
    """
    Tell whether ``arr`` holds byte strings (kind ``S``) or unicode strings
    (kind ``U``).

    ``arr`` is passed through :func:`numpy.asarray`, so plain sequences and
    scalars are accepted as well as arrays.
    """
    kind = np.asarray(arr).dtype.kind
    return kind == "S" or kind == "U"



# ----------------------------------------------------------------------
# Internal helpers
# ----------------------------------------------------------------------


def _coerce_string_value(value: Any, dtype_kind: str) -> Any:
    """
    Convert ``value`` to the text flavour implied by ``dtype_kind``.

    numpy scalars are first unwrapped to their Python equivalent.  For kind
    ``"S"`` a ``str`` is UTF-8 encoded to ``bytes``; for any other kind a
    ``bytes`` value is UTF-8 decoded to ``str``.  Values that already match,
    or that are neither ``str`` nor ``bytes``, are returned as they are.
    """
    plain = value.item() if isinstance(value, np.generic) else value

    wants_bytes = dtype_kind == "S"
    if wants_bytes and isinstance(plain, str):
        return plain.encode("utf-8")
    if not wants_bytes and isinstance(plain, bytes):
        return plain.decode("utf-8")
    return plain


def _string_eq(arr: np.ndarray, value: Any) -> np.ndarray:
    """
    Compare every element of a string array with one scalar value.

    ``value`` is first converted to ``bytes`` or ``str`` to agree with the
    array's dtype kind, so the caller may pass either flavour.  When
    ``value`` is ``None`` nothing can match and an all-False array of the
    same shape as ``arr`` is returned.
    """
    if value is not None:
        target = _coerce_string_value(value, arr.dtype.kind)
        return arr == target
    return np.zeros(arr.shape, dtype=bool)


# ----------------------------------------------------------------------
# Read path: FILLVAL -> mask/NaN
# ----------------------------------------------------------------------



[docs]
def compute_fill_mask(arr: np.ndarray, fillval: Any) -> np.ndarray:
    """
    Compute a boolean mask of fill positions in ``arr``.

    For floats, both ``NaN`` values and equality to ``fillval`` count as
    fill.  For integers and strings, only exact equality to ``fillval`` counts.
    The literal bytes ``b"nan"`` are never treated as fill on read.

    When ``fillval`` is a numpy floating scalar whose precision differs from
    the array's (for example a ``float64`` FILLVAL attribute on ``float32``
    data), it is converted to the array's dtype before the comparison, so the
    sentinel is matched as it is actually stored in the data.

    Parameters
    ----------
    arr : array-like
        The raw data array as read from the CDF.
    fillval : scalar or None
        The variable's ``FILLVAL`` attribute.  If ``None``, only ``NaN`` is
        detected for float arrays; integer and string arrays yield an
        all-False mask.

    Returns
    -------
    numpy.ndarray
        Boolean array, same shape as ``arr``.
    """
    arr = np.asarray(arr)

    if is_float_dtype(arr):
        # Float: fill is the union of NaN and FILLVAL matches.
        mask = np.isnan(arr)
        if fillval is None:
            return mask

        target = fillval
        if isinstance(fillval, np.floating):
            # numpy scalars are strongly typed: comparing float32 data with
            # np.float64(-1e31) promotes the data to float64, where the
            # float32-rounded sentinel no longer equals -1e31.  Compare in the
            # array's own precision instead.
            with np.errstate(over="ignore"):
                same_precision = arr.dtype.type(fillval)
            # Keep the original value if the conversion overflowed to inf, so
            # genuine +/-inf samples are not mistaken for fill.
            if np.isinf(same_precision) == np.isinf(fillval):
                target = same_precision
        return mask | (arr == target)

    if is_string_dtype(arr):
        # String: FILLVAL matches only.  b"nan" is not treated as fill on read.
        return _string_eq(arr, fillval)

    # Integer / other dtypes.
    # No NaN check here since numpy does not support NaN for integer dtypes.
    if fillval is None:
        return np.zeros(arr.shape, dtype=bool)

    return arr == fillval




[docs]
def apply_fillval_to_nan(arr: np.ndarray, fillval: Any) -> np.ndarray:
    """
    For float arrays, return a copy of ``arr`` with ``fillval`` positions
    replaced by ``np.nan``.

    Non-float arrays and ``fillval is None`` are no-ops: a copy of ``arr`` is
    returned unchanged.

    When ``fillval`` is a numpy floating scalar whose precision differs from
    the array's (for example a ``float64`` FILLVAL attribute on ``float32``
    data), it is converted to the array's dtype before the comparison, so the
    sentinel is matched as it is actually stored in the data.

    Parameters
    ----------
    arr : array-like
        Source data.  Not mutated.
    fillval : scalar or None
        The variable's ``FILLVAL`` attribute.

    Returns
    -------
    numpy.ndarray
        A new array; float inputs have FILLVAL positions set to ``np.nan``.
    """
    arr = np.asarray(arr)
    if fillval is None or not is_float_dtype(arr):
        return arr.copy()

    target = fillval
    if isinstance(fillval, np.floating):
        # numpy scalars are strongly typed: comparing float32 data with
        # np.float64(-1e31) promotes the data to float64, where the
        # float32-rounded sentinel no longer equals -1e31.  Compare in the
        # array's own precision instead.
        with np.errstate(over="ignore"):
            same_precision = arr.dtype.type(fillval)
        # Keep the original value if the conversion overflowed to inf, so
        # genuine +/-inf samples are not turned into NaN.
        if np.isinf(same_precision) == np.isinf(fillval):
            target = same_precision

    return np.where(arr == target, np.nan, arr)



# ----------------------------------------------------------------------
# Write path: NaN/mask -> FILLVAL
# ----------------------------------------------------------------------



[docs]
def apply_fill_on_write(arr: np.ndarray, mask: np.ndarray, fillval: Any) -> np.ndarray:
    """
    Build a copy of ``arr`` in which every fill position holds ``fillval``.

    A position is considered fill when any of the following is true:

    - it is set in ``mask`` (if a mask was supplied),
    - the value is ``NaN`` (float arrays only),
    - the value is the literal ``b"nan"`` or ``b"NaN"`` (string arrays only;
      this is what numpy stores when ``np.nan`` is placed into an ``S`` or
      ``U`` array, and is accepted as a write-side convenience).

    Parameters
    ----------
    arr : array-like
        Source data.  Not mutated.
    mask : array-like of bool, or None
        Explicit mask of fill positions.  Broadcast-compatible with ``arr``.
    fillval : scalar or None
        The variable's ``FILLVAL`` attribute.  If ``None``, ``arr`` is
        returned as a copy with no replacements.

    Returns
    -------
    numpy.ndarray
        A new array with fill positions replaced.
    """
    data = np.asarray(arr)
    if fillval is None:
        return data.copy()

    string_like = is_string_dtype(data)

    # Start from the caller's explicit mask, if there is one.
    to_fill = np.zeros(data.shape, dtype=bool)
    if mask is not None:
        to_fill = np.logical_or(to_fill, np.asarray(mask, dtype=bool))

    # Add the positions implied by the data itself.
    if is_float_dtype(data):
        to_fill = np.logical_or(to_fill, np.isnan(data))
    elif string_like:
        # np.nan placed into an S/U array is stored as the text "nan";
        # the "NaN" spelling is accepted as well.
        for literal in (b"nan", b"NaN"):
            to_fill = np.logical_or(to_fill, _string_eq(data, literal))

    result = data.copy()
    if to_fill.any():
        if string_like:
            # Match the sentinel's bytes/str flavour to the array's dtype.
            replacement = _coerce_string_value(fillval, data.dtype.kind)
        else:
            replacement = fillval
        result[to_fill] = replacement
    return result