@comment{
  pubs.bib — publication list.
  (Web-page extraction residue — GitHub archive banner and displayed line
  numbers — removed; it was not part of the BibTeX database.)
}
@misc{bridgeford2022-BatchEffects,
  title = {Batch Effects Are Causal Effects: Applications in Human Connectomics},
  shorttitle = {Batch Effects Are Causal Effects},
  author = {Bridgeford, Eric W. and Powell, Michael and Kiar, Gregory and Noble, Stephanie and Chung, Jaewon and Panda, Sambit and Lawrence, Ross and Xu, Ting and Milham, Michael and Caffo, Brian and Vogelstein, Joshua T.},
  year = {2022},
  month = oct,
  eprint = {2021.09.03.458920},
  eprinttype = {bioRxiv},
  publisher = {bioRxiv},
  doi = {10.1101/2021.09.03.458920},
  abstract = {Batch effects, undesirable sources of variance across multiple experiments, present a substantial hurdle for scientific and clinical discoveries. Specifically, the presence of batch effects can create both spurious discoveries and hide veridical signals, contributing to the ongoing reproducibility crisis. Typical approaches to dealing with batch effects conceptualize ‘batches’ as an associational effect, rather than a causal effect, despite the fact that the sources of variance that comprise the batch – potentially including experimental design and population demographics – causally impact downstream inferences. We therefore cast batch effects as a causal problem rather than an associational problem. This reformulation enables us to make explicit the assumptions and limitations of existing approaches for dealing with batch effects. We therefore develop causal batch effect strategies – Causal Dcorr for discovery of batch effects and Causal ComBat for mitigating batch effects – which build upon existing statistical associational methods by incorporating modern causal inference techniques. We apply these strategies to a large mega-study of human connectomes assembled by the Consortium for Reliability and Reproducibility, consisting of 24 batches including over 1700 individuals to illustrate that existing approaches create more spurious discoveries (false positives) and miss more veridical signals (true positives) than our proposed approaches. Our work therefore introduces a conceptual framing, as well as open source code, for combining multiple distinct datasets to increase confidence in claims of scientific and clinical discoveries.},
  chapter = {New Results},
  copyright = {© 2022, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NonCommercial-NoDerivs 4.0 International), CC BY-NC-ND 4.0, as described at http://creativecommons.org/licenses/by-nc-nd/4.0/},
  langid = {english}
}
@phdthesis{panda2020-MultivariateIndependence,
  type = {Thesis},
  title = {Multivariate Independence and {K-Sample} Testing},
  author = {Panda, Sambit},
  year = {2020},
  month = may,
  abstract = {With the increase in the amount of data in many fields, a method to consistently and efficiently decipher relationships within high dimensional data sets is important. Because many modern datasets are multivariate, univariate tests are not applicable. While many multivariate independence tests have R packages available, the interfaces are inconsistent and most are not available in Python. We introduce hyppo, which includes many state of the art multivariate testing procedures. This thesis provides details for the implementations of each of the tests within a test hyppo as well as extensive power and run-time benchmarks on a suite of high-dimensional simulations previously used in different publications. The documentation and all releases for hyppo are available at https://hyppo.neurodata.io.},
  copyright = {All rights reserved},
  langid = {american},
  school = {Johns Hopkins University},
  annotation = {Accepted: 2020-06-21T20:37:42Z}
}
@misc{panda2021-HyppoMultivariate,
  title = {{hyppo}: A Multivariate Hypothesis Testing {Python} Package},
  shorttitle = {{hyppo}},
  author = {Panda, Sambit and Palaniappan, Satish and Xiong, Junhao and Bridgeford, Eric W. and Mehta, Ronak and Shen, Cencheng and Vogelstein, Joshua T.},
  year = {2021},
  month = apr,
  eprint = {1907.02088},
  eprinttype = {arXiv},
  eprintclass = {cs, stat},
  doi = {10.48550/arXiv.1907.02088},
  abstract = {We introduce hyppo, a unified library for performing multivariate hypothesis testing, including independence, two-sample, and k-sample testing. While many multivariate independence tests have R packages available, the interfaces are inconsistent and most are not available in Python. hyppo includes many state of the art multivariate testing procedures. The package is easy-to-use and is flexible enough to enable future extensions. The documentation and all releases are available at https://hyppo.neurodata.io.},
  copyright = {All rights reserved}
}
@misc{panda2021-NonparMANOVA,
  title = {Nonpar {MANOVA} via Independence Testing},
  author = {Panda, Sambit and Shen, Cencheng and Perry, Ronan and Zorn, Jelle and Lutz, Antoine and Priebe, Carey E. and Vogelstein, Joshua T.},
  year = {2021},
  month = apr,
  eprint = {1910.08883},
  eprinttype = {arXiv},
  eprintclass = {cs, stat},
  doi = {10.48550/arXiv.1910.08883},
  abstract = {The {$k$}-sample testing problem tests whether or not {$k$} groups of data points are sampled from the same distribution. Multivariate analysis of variance (MANOVA) is currently the gold standard for {$k$}-sample testing but makes strong, often inappropriate, parametric assumptions. Moreover, independence testing and {$k$}-sample testing are tightly related, and there are many nonparametric multivariate independence tests with strong theoretical and empirical properties, including distance correlation (Dcorr) and Hilbert-Schmidt-Independence-Criterion (Hsic). We prove that universally consistent independence tests achieve universally consistent {$k$}-sample testing and that {$k$}-sample statistics like Energy and Maximum Mean Discrepancy (MMD) are exactly equivalent to Dcorr. Empirically evaluating these tests for {$k$}-sample scenarios demonstrates that these nonparametric independence tests typically outperform MANOVA, even for Gaussian distributed settings. Finally, we extend these non-parametric {$k$}-sample testing procedures to perform multiway and multilevel tests. Thus, we illustrate the existence of many theoretically motivated and empirically performant {$k$}-sample tests. A Python package with all independence and k-sample tests called hyppo is available from https://hyppo.neurodata.io/.},
  copyright = {All rights reserved}
}
@misc{shen2020-LearningInterpretable,
  title = {Learning Interpretable Characteristic Kernels via Decision Forests},
  author = {Shen, Cencheng and Panda, Sambit and Vogelstein, Joshua T.},
  year = {2020},
  month = sep,
  eprint = {1812.00029},
  eprinttype = {arXiv},
  eprintclass = {cs, stat},
  doi = {10.48550/arXiv.1812.00029},
  abstract = {Decision forests are popular tools for classification and regression. These forests naturally produce proximity matrices measuring how often each pair of observations lies in the same leaf node. It has been demonstrated that these proximity matrices can be thought of as kernels, connecting the decision forest literature to the extensive kernel machine literature. While other kernels are known to have strong theoretical properties such as being characteristic, no similar result is available for any decision forest based kernel. In this manuscript, we prove that the decision forest induced proximity can be made characteristic, which can be used to yield a universally consistent statistic for testing independence. We demonstrate the performance of the induced kernel on a suite of 20 high-dimensional independence test settings. We also show how this learning kernel offers insights into relative feature importance. The decision forest induced kernel typically achieves substantially higher testing power than existing popular methods in statistical tests.},
  copyright = {All rights reserved}
}
@article{shen2022-ChiSquareTest,
  title = {The {Chi-Square} Test of Distance Correlation},
  author = {Shen, Cencheng and Panda, Sambit and Vogelstein, Joshua T.},
  year = {2022},
  month = jan,
  journal = {Journal of Computational and Graphical Statistics},
  volume = {31},
  number = {1},
  pages = {254--262},
  publisher = {Taylor \& Francis},
  issn = {1061-8600},
  doi = {10.1080/10618600.2021.1938585},
  abstract = {Distance correlation has gained much recent attention in the data science community: the sample statistic is straightforward to compute and asymptotically equals zero if and only if independence, making it an ideal choice to discover any type of dependency structure given sufficient sample size. One major bottleneck is the testing process: because the null distribution of distance correlation depends on the underlying random variables and metric choice, it typically requires a permutation test to estimate the null and compute the p-value, which is very costly for large amount of data. To overcome the difficulty, in this article, we propose a chi-squared test for distance correlation. Method-wise, the chi-squared test is nonparametric, extremely fast, and applicable to bias-corrected distance correlation using any strong negative type metric or characteristic kernel. The test exhibits a similar testing power as the standard permutation test, and can be used for K-sample and partial testing. Theory-wise, we show that the underlying chi-squared distribution well approximates and dominates the limiting null distribution in upper tail, prove the chi-squared test can be valid and universally consistent for testing independence, and establish a testing power inequality with respect to the permutation test. Supplementary files for this article are available online.},
  copyright = {All rights reserved},
  pmid = {35707063}
}
@article{wilson2018-SelectiveMechanically,
  title = {Selective and Mechanically Robust Sensors for Electrochemical Measurements of Real-Time Hydrogen Peroxide Dynamics In Vivo},
  author = {Wilson, Leslie R. and Panda, Sambit and Schmidt, Andreas C. and Sombers, Leslie A.},
  year = {2018},
  month = jan,
  journal = {Analytical Chemistry},
  volume = {90},
  number = {1},
  pages = {888--895},
  publisher = {American Chemical Society},
  issn = {0003-2700},
  doi = {10.1021/acs.analchem.7b03770},
  abstract = {Hydrogen peroxide (H2O2) is an endogenous molecule that plays several important roles in brain function: it is generated in cellular respiration, serves as a modulator of dopaminergic signaling, and its presence can indicate the upstream production of more aggressive reactive oxygen species (ROS). H2O2 has been implicated in several neurodegenerative diseases, including Parkinson’s disease (PD), creating a critical need to identify mechanisms by which H2O2 modulates cellular processes in general and how it affects the dopaminergic nigrostriatal pathway, in particular. Furthermore, there is broad interest in selective electrochemical quantification of H2O2, because it is often enzymatically generated at biosensors as a reporter for the presence of nonelectroactive target molecules. H2O2 fluctuations can be monitored in real time using fast-scan cyclic voltammetry (FSCV) coupled with carbon-fiber microelectrodes. However, selective identification is a critical issue when working in the presence of other molecules that generate similar voltammograms, such as adenosine and histamine. We have addressed this problem by fabricating a robust, H2O2-selective electrode. 1,3-Phenylenediamine (mPD) was electrodeposited on a carbon-fiber microelectrode to create a size-exclusion membrane, rendering the electrode sensitive to H2O2 fluctuations and pH shifts but not to other commonly studied neurochemicals. The electrodes are described and characterized herein. The data demonstrate that this technology can be used to ensure the selective detection of H2O2, enabling confident characterization of the role this molecule plays in normal physiological function as well as in the progression of PD and other neuropathies involving oxidative stress.},
  copyright = {All rights reserved}
}
@misc{xu2021-WhenAre,
  title = {When Are Deep Networks Really Better than Decision Forests at Small Sample Sizes, and How?},
  author = {Xu, Haoyin and Kinfu, Kaleab A. and LeVine, Will and Panda, Sambit and Dey, Jayanta and Ainsworth, Michael and Peng, Yu-Chung and Kusmanov, Madi and Engert, Florian and White, Christopher M. and Vogelstein, Joshua T. and Priebe, Carey E.},
  year = {2021},
  month = nov,
  eprint = {2108.13637},
  eprinttype = {arXiv},
  eprintclass = {cs, q-bio, stat},
  doi = {10.48550/arXiv.2108.13637},
  abstract = {Deep networks and decision forests (such as random forests and gradient boosted trees) are the leading machine learning methods for structured and tabular data, respectively. Many papers have empirically compared large numbers of classifiers on one or two different domains (e.g., on 100 different tabular data settings). However, a careful conceptual and empirical comparison of these two strategies using the most contemporary best practices has yet to be performed. Conceptually, we illustrate that both can be profitably viewed as "partition and vote" schemes. Specifically, the representation space that they both learn is a partitioning of feature space into a union of convex polytopes. For inference, each decides on the basis of votes from the activated nodes. This formulation allows for a unified basic understanding of the relationship between these methods. Empirically, we compare these two strategies on hundreds of tabular data settings, as well as several vision and auditory settings. Our focus is on datasets with at most 10,000 samples, which represent a large fraction of scientific and biomedical datasets. In general, we found forests to excel at tabular and structured data (vision and audition) with small sample sizes, whereas deep nets performed better on structured data with larger sample sizes. This suggests that further gains in both scenarios may be realized via further combining aspects of forests and networks. We will continue revising this technical report in the coming months with updated results.},
  copyright = {All rights reserved}
}
@misc{xu2022-SimplestStreaming,
  title = {Simplest Streaming Trees},
  author = {Xu, Haoyin and Dey, Jayanta and Panda, Sambit and Vogelstein, Joshua T.},
  year = {2022},
  month = mar,
  eprint = {2110.08483},
  eprinttype = {arXiv},
  eprintclass = {cs},
  doi = {10.48550/arXiv.2110.08483},
  abstract = {Decision forests, including random forests and gradient boosting trees, remain the leading machine learning methods for many real-world data problems, specifically on tabular data. However, current standard implementations only operate in batch mode, and therefore cannot incrementally update when more data arrive. Several previous works developed streaming trees and ensembles to overcome this limitation. Nonetheless, we found that those state-of-the-art algorithms suffer from a number of drawbacks, including performing very poorly on some problems and requiring a huge amount of memory on others. We therefore developed the simplest possible extension of decision trees we could think of: given new data, simply update existing trees by continuing to grow them, and replace some old trees with new ones to control the total number of trees. On three standard datasets, we illustrate that our approach, Stream Decision Forest (SDF), does not suffer from either of the aforementioned limitations. In a benchmark suite containing 72 classification problems (the OpenML-CC18 data suite), we illustrate that our approach often performs as well, and sometimes better even, than the batch mode decision forest algorithm. Thus, we believe SDFs establish a simple standard for streaming trees and forests that could readily be applied to many real-world problems, including those with distribution drift and continual learning.},
  copyright = {All rights reserved}
}