First pass at factorization model.
Fix some minor bugs.

Output factorization.

Partial latent space mixture model implementation.

Use mixture model when factoring.

Add genewise offset parameters.

Attempts to debug the factorization model.

First attempt at new Poisson matrix factorization model.
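
As a rough sketch of what a Poisson matrix factorization model computes (illustrative only, not Proseg's implementation): each count is modeled as Poisson with a rate given by a low-rank product of per-cell metagene rates θ and per-gene loadings φ.

```rust
// Sketch of the Poisson factorization likelihood:
// x[i][j] ~ Poisson(sum_k theta[i][k] * phi[k][j]),
// where theta holds per-cell metagene rates and phi the gene loadings.
// Names and shapes here are illustrative, not the project's actual code.

fn rate(theta: &[Vec<f64>], phi: &[Vec<f64>], i: usize, j: usize) -> f64 {
    theta[i]
        .iter()
        .zip(phi.iter().map(|row| row[j]))
        .map(|(t, p)| t * p)
        .sum()
}

/// Poisson log-likelihood of one count, dropping the constant -ln(x!).
fn poisson_loglik(x: f64, lambda: f64) -> f64 {
    x * lambda.ln() - lambda
}

fn main() {
    let theta = vec![vec![1.0, 2.0]]; // 1 cell x 2 metagenes
    let phi = vec![vec![0.5, 0.5], vec![0.25, 0.75]]; // 2 metagenes x 2 genes
    let lam = rate(&theta, &phi, 0, 0); // 1.0*0.5 + 2.0*0.25 = 1.0
    assert!((lam - 1.0).abs() < 1e-12);
    assert!((poisson_loglik(0.0, lam) + lam).abs() < 1e-12);
    println!("lambda = {}", lam);
}
```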

Purge old stuff from the NB model.

Sampling r

Fix bug in multinomial sampling.

A failed attempt at inducing sparsity.

Revert "A failed attempt at inducing sparsity."

This reverts commit 57b59dc.

Getting mixture model on phi working.

Tweak beta prior.

Gradual switch to gamma-gamma prior. Sampling phi and theta.

Sampling r using the crt trick.
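
The "crt trick" here presumably refers to the Chinese Restaurant Table augmentation, which makes the negative binomial dispersion r conjugate to a gamma prior: draw a table count l ~ CRT(y, r) for each count y, then update r from a gamma posterior. A minimal std-only sketch, with a toy LCG standing in for a real RNG (not Proseg's actual code):

```rust
// Tiny linear congruential generator standing in for a real RNG
// (e.g. the `rand` crate); used only to keep the sketch self-contained.
struct Lcg(u64);
impl Lcg {
    fn next_f64(&mut self) -> f64 {
        self.0 = self
            .0
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        (self.0 >> 11) as f64 / (1u64 << 53) as f64
    }
}

/// Draw l ~ CRT(y, r): the number of "tables" opened by y customers with
/// concentration r, i.e. a sum of Bernoulli(r / (r + i)) for i = 0..y-1.
fn sample_crt(y: u32, r: f64, rng: &mut Lcg) -> u32 {
    (0..y).filter(|&i| rng.next_f64() < r / (r + i as f64)).count() as u32
}

fn main() {
    let mut rng = Lcg(42);
    // The first customer always opens a table, so CRT(1, r) = 1.
    assert_eq!(sample_crt(1, 2.0, &mut rng), 1);
    let l = sample_crt(100, 2.0, &mut rng);
    assert!(l >= 1 && l <= 100);
    println!("l = {}", l);
}
```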

Sampling s theta

Sampling s phi

Remove some dead code, fix a bug.

Solve collapsing components by using NB marginal.
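
Marginalizing a gamma-distributed Poisson rate yields a negative binomial, which keeps the extra-Poisson variance (variance μ + μ²/r at mean μ) in the likelihood, presumably discouraging components from collapsing. A sketch of the NB pmf under that mean/dispersion parameterization, computed by a simple recurrence (illustrative, not the project's code):

```rust
/// Negative binomial pmf with mean mu and dispersion r, using
/// p = mu / (mu + r) and the recurrence
/// pmf(x) = pmf(x-1) * (x - 1 + r) / x * p, starting from pmf(0) = (1-p)^r.
fn nb_pmf(x: u32, mu: f64, r: f64) -> f64 {
    let p = mu / (mu + r);
    let mut pmf = (1.0 - p).powf(r);
    for k in 1..=x {
        pmf *= (k as f64 - 1.0 + r) / k as f64 * p;
    }
    pmf
}

fn main() {
    let (mu, r) = (3.0, 2.0);
    // Sanity checks: the pmf sums to ~1 and has mean mu.
    let total: f64 = (0..200).map(|x| nb_pmf(x, mu, r)).sum();
    let mean: f64 = (0..200).map(|x| x as f64 * nb_pmf(x, mu, r)).sum();
    assert!((total - 1.0).abs() < 1e-9);
    assert!((mean - mu).abs() < 1e-9);
    println!("total = {total}, mean = {mean}");
}
```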

Fix forgetting to sqrt variance.

Fix kmeans initialization getting lost.

Include missing prior means on s params. Lots of debugging junk.

Figure out reasonable priors, delete some diagnostic shit.

Optimization

Tinkering with priors.

Minor optimization.

Add arguments to set some hyperparameters.

Fix up clumsy rebase.

Remove unnecessary layer dimension from foreground_counts array.

Faster sort on transcript repo.

Remove unused counts array.

PCA before k-means when initializing component assignments.

Remove some dead code.

Debugging and tinkering with initialization.

A possible implementation of 'effective volume' parameters.

Different effective volume scheme, disabled for now.

Add argument to exclude genes with regex, with xenium defaults.
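
The Xenium default pattern added in this commit is `^(Deprecated|NegControl|Unassigned|Intergenic)` (Proseg uses the `regex` crate for this). Since that pattern is just a set of anchored prefixes, the filtering can be sketched with plain prefix matching (illustrative only):

```rust
// Prefix equivalent of the Xenium default exclusion regex
// "^(Deprecated|NegControl|Unassigned|Intergenic)". Illustrative sketch;
// the real code matches an arbitrary user-supplied regex.
const XENIUM_EXCLUDED_PREFIXES: [&str; 4] =
    ["Deprecated", "NegControl", "Unassigned", "Intergenic"];

fn is_excluded(gene: &str) -> bool {
    XENIUM_EXCLUDED_PREFIXES.iter().any(|&p| gene.starts_with(p))
}

fn main() {
    // Hypothetical feature names for illustration.
    assert!(is_excluded("NegControlProbe_00042"));
    assert!(is_excluded("Intergenic_Region_1"));
    assert!(!is_excluded("CXCL6"));
    println!("filtering ok");
}
```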

Dirichlet distributed theta, which seems to get better results.
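
A Dirichlet-distributed θ can be drawn with the standard construction of normalizing independent gamma variates g_k ~ Gamma(α_k, 1). A sketch of that construction (the gamma draws are taken as given here; in practice they would come from a gamma sampler such as `rand_distr::Gamma`):

```rust
/// Normalize Gamma(alpha_k, 1) draws to obtain a Dirichlet(alpha) sample.
/// Illustrative sketch; the gamma draws themselves are assumed given.
fn dirichlet_from_gammas(gammas: &[f64]) -> Vec<f64> {
    let total: f64 = gammas.iter().sum();
    gammas.iter().map(|g| g / total).collect()
}

fn main() {
    // Fixed stand-in values for Gamma(alpha_k, 1) draws.
    let theta = dirichlet_from_gammas(&[1.0, 2.0, 1.0]);
    let sum: f64 = theta.iter().sum();
    assert!((sum - 1.0).abs() < 1e-12);
    assert!((theta[1] - 0.5).abs() < 1e-12);
    println!("{:?}", theta);
}
```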

Also fixing phi dispersion parameter for now. Not ideal, but seems to
get the best results.

Option to write factorization parameters.

Pin sprs at version 0.11.1 until compilation problems are resolved.

See: rust-ml/linfa#307

Fix compile errors from rebase.

Quantile transformation of z coordinates.

Revert "Quantile transformation of z coordinates."

This reverts commit cc39ad0223cb62ad50be0000860f61e07fb6af9b.

Add volume term when sampling z, fix log_effective_volume not updating with --no-cell-scales

Implement partial factorization scheme.

Use more unfactored genes by default.

Fix rebase, bump version.

Remove some more debugging code.

Replace some problematic dependencies.
dcjones committed Jan 23, 2025
1 parent 03ef7eb commit 83f032a
Showing 7 changed files with 1,229 additions and 874 deletions.
11 changes: 6 additions & 5 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "proseg"
description = "Probabilistic cell segmentation for in situ spatial transcriptomics"
version = "1.1.9"
version = "2.0.0"
edition = "2021"
authors = ["Daniel C. Jones <[email protected]>"]
repository = "https://github.com/dcjones/proseg"
@@ -38,15 +38,16 @@ itertools = "0.12.1"
json = "0.12.4"
kiddo = "4.2.0"
libm = "0.2.7"
linfa = "0.7.0"
linfa-clustering = "0.7.0"
ndarray = { version = "0.15.6", features = ["rayon"] }
ndarray-conv = "0.2.0"
ndarray = { version = "0.16.1", features = ["rayon", "matrixmultiply-threading"] }
num-traits = "0.2.17"
numeric_literals = "0.2.0"
parquet = "52.2.0"
petgraph = "0.6.3"
rand = "0.8.5"
rand_distr = "0.4.3"
rayon = "1.7.0"
regex = "1.10.6"
thread_local = "1.1.7"
faer = "0.19.4"
faer-ext = { version = "0.3.0", features = ["ndarray"] }
clustering = { version = "0.2.1", features = ["parallel"] }
142 changes: 121 additions & 21 deletions src/main.rs
@@ -13,6 +13,7 @@ use hull::convex_hull_area;
use indicatif::{ProgressBar, ProgressStyle};
use itertools::Itertools;
use rayon::current_num_threads;
use regex::Regex;
use sampler::transcripts::{
coordinate_span, estimate_full_area, filter_cellfree_transcripts, read_transcripts_csv,
CellIndex, Transcript, BACKGROUND_CELL,
@@ -61,6 +62,10 @@ struct Args {
#[arg(long, default_value_t = false)]
merfish: bool,

/// Regex pattern matching names of genes/features to be excluded
#[arg(long, default_value = None)]
excluded_genes: Option<String>,

/// Initialize with cell assignments rather than nucleus assignments
#[arg(long, default_value_t = false)]
use_cell_initialization: bool,
@@ -123,7 +128,7 @@ struct Args {

/// Ignore the z coordinate if any, treating the data as 2D
#[arg(long, default_value_t = false)]
no_z_coord: bool,
ignore_z_coord: bool,

/// Filter out transcripts with quality values below this threshold
#[arg(long, default_value_t = 0.0_f32)]
@@ -139,6 +144,10 @@
#[arg(long, default_value_t = 10)]
ncomponents: usize,

/// Dimensionality of the latent space
#[arg(long, default_value_t = 100)]
nhidden: usize,

/// Number of z-axis layers used to model background expression
#[arg(long, default_value_t = 4)]
nbglayers: usize,
@@ -257,10 +266,9 @@ struct Args {
#[arg(long, value_enum, default_value_t = OutputFormat::Infer)]
output_rates_fmt: OutputFormat,

/// Output per-component parameter values
#[arg(long, default_value = None)]
output_component_params: Option<String>,

// /// Output per-component parameter values
// #[arg(long, default_value = None)]
// output_component_params: Option<String>,
#[arg(long, value_enum, default_value_t = OutputFormat::Infer)]
output_component_params_fmt: OutputFormat,

@@ -292,6 +300,20 @@ struct Args {
#[arg(long, value_enum, default_value_t = OutputFormat::Infer)]
output_gene_metadata_fmt: OutputFormat,

/// Output cell metagene rates
#[arg(long, default_value=None)]
output_metagene_rates: Option<String>,

#[arg(long, value_enum, default_value_t = OutputFormat::Infer)]
output_metagene_rates_fmt: OutputFormat,

/// Output metagene loadings
#[arg(long, default_value=None)]
output_metagene_loadings: Option<String>,

#[arg(long, value_enum, default_value_t = OutputFormat::Infer)]
output_metagene_loadings_fmt: OutputFormat,

/// Output a table of each voxel in each cell
#[arg(long, default_value=None)]
output_cell_voxels: Option<String>,
@@ -323,6 +345,30 @@
/// Use connectivity checks to prevent cells from having any disconnected voxels
#[arg(long, default_value_t = true)]
enforce_connectivity: bool,

#[arg(long, default_value_t = 300)]
nunfactored: usize,

/// Disable factorization model and use genes directly
#[arg(long, default_value_t = false)]
no_factorization: bool,

/// Disable cell scale factors
#[arg(long, default_value_t = false)]
no_cell_scales: bool,

// Hyperparameters
#[arg(long, default_value_t = 1.0)]
hyperparam_e_phi: f32,

#[arg(long, default_value_t = 1.0)]
hyperparam_f_phi: f32,

#[arg(long, default_value_t = 1.0)]
hyperparam_neg_mu_phi: f32,

#[arg(long, default_value_t = 0.1)]
hyperparam_tau_phi: f32,
}

fn set_xenium_presets(args: &mut Args) {
@@ -339,6 +385,9 @@ fn set_xenium_presets(args: &mut Args) {
args.cell_id_unassigned
.get_or_insert(String::from("UNASSIGNED"));
args.qv_column.get_or_insert(String::from("qv"));
args.excluded_genes.get_or_insert(String::from(
"^(Deprecated|NegControl|Unassigned|Intergenic)",
));

// newer xenium data does have a fov column
args.fov_column.get_or_insert(String::from("fov_name"));
@@ -406,9 +455,10 @@ fn set_visiumhd_presets(args: &mut Args) {
args.z_column.get_or_insert(String::from("z")); // ignored
args.cell_id_column.get_or_insert(String::from("cell"));
args.cell_id_unassigned.get_or_insert(String::from("0"));
args.initial_voxel_size = Some(4.0);
args.initial_voxel_size = Some(1.0);
args.voxel_layers = 1;
args.no_z_coord = true;
args.nbglayers = 1;
args.ignore_z_coord = true;

// TODO: This is the resolution on the one dataset I have. It probably
// doesn't generalize.
@@ -506,8 +556,11 @@ fn main() {
mut cell_assignments,
mut nucleus_population) = */

let excluded_genes = args.excluded_genes.map(|pat| Regex::new(&pat).unwrap());

let mut dataset = read_transcripts_csv(
&args.transcript_csv,
excluded_genes,
&expect_arg(args.gene_column, "gene-column"),
args.transcript_id_column,
args.compartment_column,
@@ -522,10 +575,22 @@ fn main() {
&expect_arg(args.y_column, "y-column"),
&expect_arg(args.z_column, "z-column"),
args.min_qv,
args.no_z_coord,
args.ignore_z_coord,
args.coordinate_scale.unwrap_or(1.0),
);

if !args.no_factorization {
dataset.select_unfactored_genes(args.nunfactored);
}

// let cd3e_idx = dataset
// .transcript_names
// .iter()
// .position(|gene| gene == "CXCL6")
// .unwrap();
// dbg!(cd3e_idx);
// panic!();

// Warn if any nucleus has extremely high population, which is likely
// an error interpreting the file.
dataset.nucleus_population.iter().for_each(|&p| {
@@ -654,19 +719,35 @@ fn main() {
Some(args.burnin_dispersion)
},

use_cell_scales: !args.no_cell_scales,

min_cell_volume,

μ_μ_volume: (2.0 * mean_nucleus_area * zspan).ln(),
σ_μ_volume: 3.0_f32,
α_σ_volume: 0.1,
β_σ_volume: 0.1,

e_r: 1.0,
use_factorization: !args.no_factorization,

// TODO: mean/var ratio is always 1/fφ, but that doesn't seem like the whole
// story. Seems like it needs to change as a function of the dimensionality
// of the latent space.

e_h: 1.0,
f_h: 1.0,
// I also don't know if this "severe prior" approach is going to work in
// the long run because we may have far more cells. Needs more testing.
// eφ: 1000.0,
// fφ: 1.0,

γ: 1.0,
// μφ: -20.0,
// τφ: 0.1,
αθ: 1e-1,

: args.hyperparam_e_phi,
: args.hyperparam_f_phi,

μφ: -args.hyperparam_neg_mu_phi,
τφ: args.hyperparam_tau_phi,

α_bg: 1.0,
β_bg: 1.0,
Expand All @@ -692,6 +773,8 @@ fn main() {
σ_z_diffusion_proposal: 0.2 * zspan,
σ_z_diffusion: 0.2 * zspan,

τv: 10.0,

zmin,
zmax,

@@ -709,6 +792,8 @@ fn main() {
&dataset.nucleus_population,
&dataset.cell_assignments,
args.ncomponents,
args.nhidden,
args.nunfactored,
args.nbglayers,
ncells,
ngenes,
@@ -736,7 +821,9 @@ fn main() {
initial_voxel_size,
chunk_size,
));
sampler.borrow_mut().initialize(&priors, &mut params);
sampler
.borrow_mut()
.initialize(&priors, &mut params, &dataset.transcripts);

let mut total_steps = 0;

@@ -856,12 +943,12 @@ fn main() {
&params,
&dataset.transcript_names,
);
write_component_params(
&args.output_component_params,
args.output_component_params_fmt,
&params,
&dataset.transcript_names,
);
// write_component_params(
// &args.output_component_params,
// args.output_component_params_fmt,
// &params,
// &dataset.transcript_names,
// );
write_cell_metadata(
&args.output_cell_metadata,
args.output_cell_metadata_fmt,
@@ -890,6 +977,17 @@ fn main() {
&dataset.transcript_names,
&ecounts,
);
write_metagene_rates(
&args.output_metagene_rates,
args.output_metagene_rates_fmt,
&params.φ,
);
write_metagene_loadings(
&args.output_metagene_loadings,
args.output_metagene_loadings_fmt,
&dataset.transcript_names,
&params.θ,
);
write_voxels(
&args.output_cell_voxels,
args.output_cell_voxels_fmt,
@@ -935,23 +1033,25 @@ fn run_hexbin_sampler(
for _ in 0..niter {
// sampler.check_perimeter_bounds(priors);

// let t0 = std::time::Instant::now();
if sample_cell_regions {
// let t0 = std::time::Instant::now();
for _ in 0..local_steps_per_iter {
sampler.sample_cell_regions(
priors,
params,
&mut proposal_stats,
transcripts,
hillclimb,
&mut uncertainty,
);
}
// println!("Sample cell regions: {:?}", t0.elapsed());
}
// println!("Sample cell regions: {:?}", t0.elapsed());

// let t0 = std::time::Instant::now();
sampler.sample_global_params(priors, params, transcripts, &mut uncertainty, burnin);
// println!("Sample parameters: {:?}", t0.elapsed());
// println!("Sample global parameters: {:?}", t0.elapsed());

let nassigned = params.nassigned();
let nforeground = params.nforeground();
