{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Gaussian Mixture Model Thresholding\n", "\n", "This notebook demonstrates the `GMMThresholding` class from the `cc_mapping` package, which provides a flexible, semi-supervised approach to categorizing samples based on continuous features using Gaussian Mixture Models (GMMs).\n", "\n", "## Key Features\n", "\n", "- **Automatic threshold determination**: Fit GMMs and automatically calculate decision boundaries\n", "- **Manual threshold override**: Specify custom thresholds while leveraging GMM visualization\n", "- **Label collapsing**: Fit many GMM components for adaptive boundaries, then collapse to fewer categories\n", "- **Exploratory visualization**: Explore data distributions before committing to thresholding\n", "- **Comprehensive visualization**: Multiple plotting methods to understand your data and thresholds" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import all required libraries\n", "from pathlib import Path\n", "from urllib.request import urlretrieve\n", "\n", "import anndata as ad\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "\n", "from cc_mapping.thresholding import GMMThresholding\n", "from cc_mapping.utils import create_boolean_label_combination\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "## Setup and Data Loading" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Remote location of the dataset CSV that this notebook loads.\n", "url = \"https://zenodo.org/records/4525425/files/control_manifold_allfeatures.csv?download=1\"\n", "\n", "# All data and results paths below are resolved relative to the working directory.\n", "cwd = Path.cwd()\n", "\n", "download_path = cwd / \"data\" / \"control_manifold_allfeatures.csv\"\n", "\n", "# Make sure the data directory exists, then download only when the CSV itself is\n", "# missing. Check for the file, not the directory: data/ may already exist without it.\n", "download_path.parent.mkdir(parents=True, exist_ok=True)\n", "if not download_path.exists():\n", "    urlretrieve(url, download_path)" ] }, { "cell_type": "code", "execution_count": 3, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Results directory created at: c:\\Users\\dap182\\Documents\\git\\cc_mapping\\notebooks\\results\n", "Single threshold results directory: c:\\Users\\dap182\\Documents\\git\\cc_mapping\\notebooks\\results\\single\n" ] } ], "source": [ "# Create results directory structure for saving figures\n", "results_dir = cwd / \"results\"\n", "single_results_dir = results_dir / \"single\"\n", "\n", "# Create directories if they don't exist (safe to re-run)\n", "results_dir.mkdir(parents=True, exist_ok=True)\n", "single_results_dir.mkdir(parents=True, exist_ok=True)\n", "\n", "print(f\"Results directory created at: {results_dir}\")\n", "print(f\"Single threshold results directory: {single_results_dir}\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | E2F1 (nuc median) | \n", "cycA (nuc median) | \n", "cycD1 (nuc median) | \n", "p21 (nuc median) | \n", "Int_Intg_DNA_nuc | \n", "Skp2 (nuc median) | \n", "Cdt1 (nuc median) | \n", "Nuc area | \n", "Cdh1 (nuc median) | \n", "cycE (nuc median) | \n", "... | \n", "STAT3 (phospho/total nuc) | \n", "age | \n", "phase | \n", "PHATE_1 | \n", "PHATE_2 | \n", "PCNA foci | \n", "DNA content | \n", "Local cell density | \n", "annotated age | \n", "annotated phase | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0.1.1 | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| 0 | \n", "0.006851 | \n", "0.005875 | \n", "0.019471 | \n", "0.026032 | \n", "5.746899 | \n", "0.006493 | \n", "0.005158 | \n", "553.0 | \n", "0.014336 | \n", "0.009262 | \n", "... | \n", "1.770186 | \n", "10.263173 | \n", "G1 | \n", "-0.020654 | \n", "0.007907 | \n", "0.005151 | \n", "2.298759 | \n", "7.0 | \n", "NaN | \n", "NaN | \n", "
| 1 | \n", "0.012360 | \n", "0.028153 | \n", "0.007462 | \n", "0.004318 | \n", "8.852262 | \n", "0.022797 | \n", "0.005951 | \n", "490.0 | \n", "0.015229 | \n", "0.008392 | \n", "... | \n", "1.455814 | \n", "12.109644 | \n", "S | \n", "0.025945 | \n", "-0.000735 | \n", "0.006861 | \n", "3.540905 | \n", "1.0 | \n", "NaN | \n", "NaN | \n", "
| 2 | \n", "0.007279 | \n", "0.005707 | \n", "0.006592 | \n", "0.003632 | \n", "4.951003 | \n", "0.015366 | \n", "0.004929 | \n", "363.0 | \n", "0.005730 | \n", "0.009117 | \n", "... | \n", "1.290323 | \n", "4.954907 | \n", "G1 | \n", "0.002961 | \n", "-0.004575 | \n", "0.004148 | \n", "1.980401 | \n", "6.0 | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "0.006531 | \n", "0.016602 | \n", "0.009369 | \n", "0.006264 | \n", "10.466743 | \n", "0.019196 | \n", "0.005234 | \n", "579.0 | \n", "0.009361 | \n", "0.007118 | \n", "... | \n", "1.435065 | \n", "13.424587 | \n", "G2 | \n", "0.034884 | \n", "0.002944 | \n", "0.002457 | \n", "4.186697 | \n", "9.0 | \n", "NaN | \n", "NaN | \n", "
| 5 | \n", "0.006744 | \n", "0.006027 | \n", "0.009033 | \n", "0.005295 | \n", "5.249119 | \n", "0.010422 | \n", "0.007797 | \n", "296.0 | \n", "0.008896 | \n", "0.007767 | \n", "... | \n", "1.266304 | \n", "1.828240 | \n", "G1 | \n", "-0.019039 | \n", "0.000584 | \n", "0.003277 | \n", "2.099648 | \n", "3.0 | \n", "NaN | \n", "NaN | \n", "
5 rows × 299 columns
\n", "| \n", " | Operation | \n", "Type | \n", "Feature | \n", "Layer | \n", "Obs Label | \n", "Components | \n", "Thresholds | \n", "Labels | \n", "Parent | \n", "Refined From | \n", "Total Cells | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "1. cycD1 (nuc median) | \n", "standard | \n", "cycD1 (nuc median) | \n", "None | \n", "Low/High DNA Content | \n", "2 | \n", "0.0191 | \n", "Low, High | \n", "None | \n", "N/A | \n", "6797 | \n", "