lactationcurve.preprocessing

 1from .validate_and_standardize import (
 2    PreparedInputs,
 3    standardize_lactation_columns,
 4    validate_and_prepare_inputs,
 5)
 6
 7__all__ = [
 8    "PreparedInputs",
 9    "standardize_lactation_columns",
10    "validate_and_prepare_inputs",
11]
@dataclass
class PreparedInputs:
42@dataclass
43class PreparedInputs:
44    """Normalized, ready‑to‑fit inputs.
45
46    This container is returned by `validate_and_prepare_inputs` and is the single
47    hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional;
48    categorical fields are lower/upper‑cased as appropriate and may be `None` if omitted.
49
50    Attributes:
51        dim: 1D NumPy array of day‑in‑milk values (finite; same length as `milkrecordings`).
52        milkrecordings: 1D NumPy array of test‑day milk yields aligned to `dim`.
53        model: Lowercased model identifier or `None` if not provided.
54        fitting: `"frequentist"` or `"bayesian"` (lowercased) or `None`.
55        breed: `"H"` or `"J"` or `None`.
56        parity: Lactation number as `int`, if provided; otherwise `None`.
57        continent: Prior source flag for Bayesian flows (`"USA"`, `"EU"`, `"CHEN"`), or `None`.
58        persistency_method: Either `"derived"` or `"literature"`, or `None`.
59        lactation_length: Integer horizon (e.g., 305), the string `"max"`, or `None`.
60    """
61    dim: np.ndarray
62    milkrecordings: np.ndarray
63    model: str | None = None
64    fitting: str | None = None
65    breed: str | None = None
66    parity: int | None = None
67    continent: str | None = None
68    persistency_method: str | None = None
69    lactation_length: int | str | None = None

Normalized, ready‑to‑fit inputs.

This container is returned by validate_and_prepare_inputs and is the single hand‑off object expected by the fitting routines. Arrays are finite and 1‑dimensional; categorical fields are lower/upper‑cased as appropriate and may be None if omitted.

Attributes:
  • dim: 1D NumPy array of day‑in‑milk values (finite; same length as milkrecordings).
  • milkrecordings: 1D NumPy array of test‑day milk yields aligned to dim.
  • model: Lowercased model identifier or None if not provided.
  • fitting: "frequentist" or "bayesian" (lowercased) or None.
  • breed: "H" or "J" or None.
  • parity: Lactation number as int, if provided; otherwise None.
  • continent: Prior source flag for Bayesian flows ("USA", "EU", "CHEN"), or None.
  • persistency_method: Either "derived" or "literature", or None.
  • lactation_length: Integer horizon (e.g., 305), the string "max", or None.
PreparedInputs( dim: numpy.ndarray, milkrecordings: numpy.ndarray, model: str | None = None, fitting: str | None = None, breed: str | None = None, parity: int | None = None, continent: str | None = None, persistency_method: str | None = None, lactation_length: int | str | None = None)
dim: numpy.ndarray
milkrecordings: numpy.ndarray
model: str | None = None
fitting: str | None = None
breed: str | None = None
parity: int | None = None
continent: str | None = None
persistency_method: str | None = None
lactation_length: int | str | None = None
def standardize_lactation_columns( df: pandas.DataFrame, *, days_in_milk_col: str | None = None, milking_yield_col: str | None = None, test_id_col: str | None = None, default_test_id=0, max_dim: int = 305):
195def standardize_lactation_columns(
196    df: pd.DataFrame,
197    *,
198    days_in_milk_col: str | None = None,
199    milking_yield_col: str | None = None,
200    test_id_col: str | None = None,
201    default_test_id=0,
202    max_dim: int = 305,
203):
204    """
205    Standardize column names and structure for lactation data.
206
207    Returns
208    -------
209    df_out : pd.DataFrame
210        Copy of df with standardized columns:
211        - DaysInMilk
212        - MilkingYield
213        - TestId
214    """
215
216    df = df.copy()
217
218    # Accepted aliases (case-insensitive)
219    aliases = {
220        "DaysInMilk": ["daysinmilk", "dim", "testday"],
221        "MilkingYield": [
222            "milkingyield",
223            "testdaymilkyield",
224            "milkyield",
225            "yield",
226            "milkproduction",
227            "milk_yield",
228        ],
229        "TestId": ["testid", "animalid", "id"],
230    }
231
232    # Lowercase lookup → actual column name
233    col_lookup = {col.lower(): col for col in df.columns}
234
235    def resolve_col(override, possible_names):
236        if override:
237            return col_lookup.get(override.lower())
238        for name in possible_names:
239            if name in col_lookup:
240                return col_lookup[name]
241        return None
242
243    # Resolve columns
244    dim_col = resolve_col(days_in_milk_col, aliases["DaysInMilk"])
245    if not dim_col:
246        raise ValueError("No DaysInMilk column found.")
247
248    yield_col = resolve_col(milking_yield_col, aliases["MilkingYield"])
249    if not yield_col:
250        raise ValueError("No MilkingYield column found.")
251
252    id_col = resolve_col(test_id_col, aliases["TestId"])
253
254    # Create TestId if missing
255    if not id_col:
256        df["TestId"] = default_test_id
257        id_col = "TestId"
258
259    # Rename to standardized names
260    df = df.rename(
261        columns={
262            dim_col: "DaysInMilk",
263            yield_col: "MilkingYield",
264            id_col: "TestId",
265        }
266    )
267
268    # Filter DIM
269    df = df[df["DaysInMilk"] <= max_dim]
270
271    return df

Standardize column names and structure for lactation data.

Returns

df_out : pd.DataFrame
Copy of df with standardized columns:
  • DaysInMilk
  • MilkingYield
  • TestId

def validate_and_prepare_inputs( dim, milkrecordings, model=None, fitting=None, *, breed=None, parity=None, continent=None, persistency_method=None, lactation_length=None) -> PreparedInputs:
 72def validate_and_prepare_inputs(
 73    dim,
 74    milkrecordings,
 75    model=None,
 76    fitting=None,
 77    *,
 78    breed=None,
 79    parity=None,
 80    continent=None,
 81    persistency_method=None,
 82    lactation_length=None,
 83) -> PreparedInputs:
 84    """
 85    Validate, normalize, and clean input data for lactation curve fitting.
 86
 87    This function performs basic consistency checks on the provided
 88    days-in-milk (DIM) and milk recording data, normalizes optional
 89    categorical parameters, and removes observations with missing or
 90    non-finite values. The cleaned and validated inputs are returned
 91    in a structured :class:`PreparedInputs` object.
 92
 93    Parameters
 94    ----------
 95    dim : array-like
 96        Days in milk corresponding to each milk recording.
 97    milkrecordings : array-like
 98        Milk yield measurements corresponding to `dim`.
 99    model : str or None, optional
100        Name of the lactation curve model. If provided, the name is
101        stripped of whitespace and converted to lowercase.
102    fitting : str or None, optional
103        Fitting approach to be used. Must be either ``"frequentist"``
104        or ``"bayesian"`` if provided.
105    breed : str or None, optional
106        Cow breed identifier. Must be ``"H"`` (Holstein) or ``"J"``
107        (Jersey) if provided. Case-insensitive.
108    parity : int or None, optional
109        Lactation number (parity). If provided, it is coerced to an
110        integer.
111    continent : str or None, optional
112        Geographic region identifier. Must be one of ``"USA"``,
113        ``"EU"``, or ``"CHEN"`` if provided. Case-insensitive.
114
115    Extra input for persistency calculation:
116        persistency_method (String): way of calculating persistency, options: 'derived' which gives the average slope of the lactation after the peak until the end of lactation (default) or 'literature' for the wood and milkbot model.
117        Lactation_length: string or int: length of the lactation in days to calculate persistency over, options: 305 = default or 'max'  uses the maximum DIM in the data, or an integer value to set the desired lactation length.
118
119    Returns
120    -------
121    PreparedInputs
122        A dataclass containing the cleaned numeric arrays (`dim`,
123        `milkrecordings`) and the normalized optional parameters.
124
125    Raises
126    ------
127    ValueError
128        If input arrays have different lengths, contain insufficient
129        valid observations, or if categorical parameters are invalid.
130
131    Notes
132    -----
133    Observations with missing or non-finite values in either `dim` or
134    `milkrecordings` are removed prior to model fitting. At least two
135    valid observations are required to proceed.
136    """
137    if len(dim) != len(milkrecordings):
138        raise ValueError("dim and milkrecordings must have the same length")
139
140    model = (model or "").strip().lower()
141
142    if parity is not None:
143        parity = int(parity)
144
145    if fitting is not None:
146        fitting = fitting.lower()
147        if fitting not in {"frequentist", "bayesian"}:
148            raise ValueError("Fitting method must be either frequentist or bayesian")
149
150    if breed is not None:
151        breed = breed.upper()
152        if breed not in {"H", "J"}:
153            raise ValueError("Breed must be either Holstein = 'H' or Jersey 'J'")
154
155    if continent is not None:
156        continent = continent.upper()
157        if continent not in {"USA", "EU", "CHEN"}:
158            raise ValueError("continent must be 'USA', 'EU', or 'CHEN'")
159
160    dim = np.asarray(dim, dtype=float)
161    milkrecordings = np.asarray(milkrecordings, dtype=float)
162
163    mask = np.isfinite(dim) & np.isfinite(milkrecordings)
164    dim = dim[mask]
165    milkrecordings = milkrecordings[mask]
166
167    if len(dim) < 2:
168        raise ValueError("At least two non missing points are required to fit a lactation curve")
169
170    if persistency_method is not None:
171        persistency_method = persistency_method.lower()
172        if persistency_method not in {"derived", "literature"}:
173            raise ValueError("persistency_method must be either 'derived' or 'literature'")
174
175    if lactation_length is not None:
176        if isinstance(lactation_length, str):
177            if lactation_length.lower() != "max":
178                raise ValueError("lactation_length string option must be 'max'")
179        else:
180            lactation_length = int(lactation_length)
181
182    return PreparedInputs(
183        dim=dim,
184        milkrecordings=milkrecordings,
185        model=model or None,
186        fitting=fitting,
187        breed=breed,
188        parity=parity,
189        continent=continent,
190        persistency_method=persistency_method,
191        lactation_length=lactation_length,
192    )

Validate, normalize, and clean input data for lactation curve fitting.

This function performs basic consistency checks on the provided days-in-milk (DIM) and milk recording data, normalizes optional categorical parameters, and removes observations with missing or non-finite values. The cleaned and validated inputs are returned in a structured PreparedInputs object.

Parameters

  • dim : array-like. Days in milk corresponding to each milk recording.
  • milkrecordings : array-like. Milk yield measurements corresponding to dim.
  • model : str or None, optional. Name of the lactation curve model. If provided, the name is stripped of whitespace and converted to lowercase.
  • fitting : str or None, optional. Fitting approach to be used. Must be either "frequentist" or "bayesian" if provided.
  • breed : str or None, optional. Cow breed identifier. Must be "H" (Holstein) or "J" (Jersey) if provided. Case-insensitive.
  • parity : int or None, optional. Lactation number (parity). If provided, it is coerced to an integer.
  • continent : str or None, optional. Geographic region identifier. Must be one of "USA", "EU", or "CHEN" if provided. Case-insensitive.

Extra input for persistency calculation:

  • persistency_method (str): how persistency is calculated. Options: 'derived', the average slope of the lactation curve after the peak until the end of lactation (default), or 'literature', available for the Wood and MilkBot models.
  • lactation_length (str or int): length of the lactation in days over which persistency is calculated. Options: 305 (default), 'max' to use the maximum DIM in the data, or any other integer to set the desired lactation length.

Returns

PreparedInputs A dataclass containing the cleaned numeric arrays (dim, milkrecordings) and the normalized optional parameters.

Raises

ValueError If input arrays have different lengths, contain insufficient valid observations, or if categorical parameters are invalid.

Notes

Observations with missing or non-finite values in either dim or milkrecordings are removed prior to model fitting. At least two valid observations are required to proceed.