Source code for cc_mapping.utils

"""
Utility functions for the cc_mapping package.
"""

from __future__ import annotations

from typing import List

import anndata as ad
import numpy as np
import pandas as pd


def _require_observed_values(
    obs: pd.DataFrame,
    obs_key: str,
    match_values: List[str],
    param_name: str,
) -> None:
    """
    Raise ValueError if a requested match value is absent from an obs column.

    Parameters
    ----------
    obs : pd.DataFrame
        Observation table to look the column up in.
    obs_key : str
        Column of ``obs`` whose unique values are checked.
    match_values : List[str]
        Values that must each occur at least once in ``obs[obs_key]``.
    param_name : str
        Name of the caller's parameter holding ``obs_key``; used only in the
        error message (e.g. ``'obs_key_1'``).

    Raises
    ------
    ValueError
        If any element of ``match_values`` is not among the unique values of
        ``obs[obs_key]``.
    """
    observed = set(obs[obs_key].unique())
    for val in match_values:
        if val in observed:
            continue
        # Columns holding strings plus missing values (None/NaN) cannot be
        # ordered naturally; fall back to ordering by string form so the
        # caller still receives the intended ValueError, not a TypeError.
        try:
            available = sorted(observed)
        except TypeError:
            available = sorted(observed, key=str)
        raise ValueError(
            f"Value '{val}' not found in {param_name} '{obs_key}'. "
            f"Available values: {available}"
        )


def create_boolean_label_combination(
    adata: ad.AnnData,
    obs_key_1: str,
    match_values_1: List[str],
    obs_key_2: str,
    match_values_2: List[str],
    operator: str,
    output_obs_key: str,
    true_label: str,
    false_label: str,
    overwrite: bool = False,
) -> ad.AnnData:
    """
    Combine two categorical observation columns using boolean operators.

    Creates a new binary label based on whether cells match specified values
    in both input observation columns, using the specified boolean operator.
    The new column is written to ``adata.obs`` in place and the same
    ``adata`` object is returned.

    Parameters
    ----------
    adata : ad.AnnData
        AnnData object with observations to combine.
    obs_key_1 : str
        First observation column name in adata.obs.
    match_values_1 : List[str]
        Values in obs_key_1 to match (considered "true" for boolean logic).
    obs_key_2 : str
        Second observation column name in adata.obs.
    match_values_2 : List[str]
        Values in obs_key_2 to match (considered "true" for boolean logic).
    operator : str
        Boolean operator - 'AND', 'OR', or 'XOR' (case-insensitive).
    output_obs_key : str
        Name for new combined observation column.
    true_label : str
        Label for cells matching the boolean criteria.
    false_label : str
        Label for cells not matching the boolean criteria.
    overwrite : bool, default False
        If True, overwrites existing output_obs_key. If False, raises error
        if output_obs_key exists.

    Returns
    -------
    ad.AnnData
        Modified AnnData object with new observation column.

    Raises
    ------
    KeyError
        If obs_key_1 or obs_key_2 don't exist in adata.obs.
    ValueError
        If operator is not 'AND', 'OR', or 'XOR'.
    KeyError
        If output_obs_key already exists in adata.obs and overwrite=False.
    TypeError
        If match_values_1 or match_values_2 are not lists.
    ValueError
        If any values in match_values_1 not found in obs_key_1.
    ValueError
        If any values in match_values_2 not found in obs_key_2.

    Examples
    --------
    AND: Both conditions must be true

    >>> adata = create_boolean_label_combination(
    ...     adata,
    ...     obs_key_1='treatment',
    ...     match_values_1=['control'],
    ...     obs_key_2='cell_cycle',
    ...     match_values_2=['G0'],
    ...     operator='AND',
    ...     output_obs_key='control_G0',
    ...     true_label='control_G0',
    ...     false_label='other'
    ... )

    OR: Either condition true

    >>> adata = create_boolean_label_combination(
    ...     adata,
    ...     obs_key_1='treatment',
    ...     match_values_1=['control', 'vehicle'],
    ...     obs_key_2='cell_cycle',
    ...     match_values_2=['G0', 'G1'],
    ...     operator='OR',
    ...     output_obs_key='quiescent_or_control',
    ...     true_label='positive',
    ...     false_label='other'
    ... )

    XOR: Exactly one condition true (exclusive or)

    >>> adata = create_boolean_label_combination(
    ...     adata,
    ...     obs_key_1='marker_A',
    ...     match_values_1=['positive'],
    ...     obs_key_2='marker_B',
    ...     match_values_2=['positive'],
    ...     operator='XOR',
    ...     output_obs_key='single_positive',
    ...     true_label='single_positive',
    ...     false_label='double_or_negative'
    ... )
    """
    # Validate observation columns exist
    if obs_key_1 not in adata.obs.columns:
        raise KeyError(
            f"obs_key_1 '{obs_key_1}' not found in adata.obs. "
            f"Available columns: {list(adata.obs.columns)}"
        )
    if obs_key_2 not in adata.obs.columns:
        raise KeyError(
            f"obs_key_2 '{obs_key_2}' not found in adata.obs. "
            f"Available columns: {list(adata.obs.columns)}"
        )

    # Validate operator (normalised to upper case, so 'and' == 'AND')
    valid_operators = ['AND', 'OR', 'XOR']
    operator = operator.upper()
    if operator not in valid_operators:
        raise ValueError(
            f"operator must be one of {valid_operators}, got '{operator}'"
        )

    # Validate output_obs_key doesn't already exist (unless overwrite=True)
    if output_obs_key in adata.obs.columns and not overwrite:
        raise KeyError(
            f"output_obs_key '{output_obs_key}' already exists in adata.obs. "
            "Set overwrite=True to replace it, or choose a different name."
        )

    # Validate match values are lists (a bare string would otherwise be
    # iterated character by character further down)
    if not isinstance(match_values_1, list):
        raise TypeError(
            f"match_values_1 must be a list, got {type(match_values_1)}"
        )
    if not isinstance(match_values_2, list):
        raise TypeError(
            f"match_values_2 must be a list, got {type(match_values_2)}"
        )

    # Validate all values exist in their respective observation columns
    _require_observed_values(adata.obs, obs_key_1, match_values_1, "obs_key_1")
    _require_observed_values(adata.obs, obs_key_2, match_values_2, "obs_key_2")

    # Create boolean masks
    mask1 = adata.obs[obs_key_1].isin(match_values_1)
    mask2 = adata.obs[obs_key_2].isin(match_values_2)

    # Apply boolean operator; `operator` was validated above, so the final
    # branch can only be 'XOR'
    if operator == 'AND':
        final_mask = mask1 & mask2
    elif operator == 'OR':
        final_mask = mask1 | mask2
    else:
        final_mask = mask1 ^ mask2

    # Create new categorical column
    new_labels = np.where(final_mask, true_label, false_label)
    adata.obs[output_obs_key] = pd.Categorical(new_labels)

    return adata