-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Mehmet <[email protected]>
- Loading branch information
0 parents
commit 7418349
Showing
23 changed files
with
12,811,142 additions
and
0 deletions.
There are no files selected for viewing
Binary file added
BIN
+5.73 MB
...alyses of 10,376 individuals in the Westlake BioBank for Chinese (WBBC) pilot project.pdf
Binary file not shown.
Binary file added
BIN
+448 KB
PCA and ADMIXTURE analysis of the Han Chinese populations and East Asians.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD) | ||
#There are regions of long-range, high linkage disequilibrium in the human genome. These regions should be excluded when performing certain analyses such as principal component analysis on genotype data. | ||
#Here is a list of positions for GRCh37 (Build 37) | ||
#You can remove these regions from a PED file using the following PLINK commands. | ||
#plink --file mydata --make-set high-ld.txt --write-set --out hild | ||
#plink --file mydata --exclude hild.set --recode --out mydatatrimmed | ||
#Chr Start Stop | ||
1 48000000 52000000 | ||
2 86000000 100500000 | ||
2 134500000 138000000 | ||
2 183000000 190000000 | ||
3 47500000 50000000 | ||
3 83500000 87000000 | ||
3 89000000 97500000 | ||
5 44500000 50500000 | ||
5 98000000 100500000 | ||
5 129000000 132000000 | ||
5 135500000 138500000 | ||
6 25000000 35000000 | ||
6 57000000 64000000 | ||
6 140000000 142500000 | ||
7 55000000 66000000 | ||
8 7000000 13000000 | ||
8 43000000 50000000 | ||
8 112000000 115000000 | ||
10 37000000 43000000 | ||
11 46000000 57000000 | ||
11 87500000 90500000 | ||
12 33000000 40000000 | ||
12 109500000 112000000 | ||
20 32000000 34500000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD) | ||
#There are regions of long-range, high linkage disequilibrium in the human genome. These regions should be excluded when performing certain analyses such as principal component analysis on genotype data. | ||
#Here is a list of positions for GRCh38 (Build 38) | ||
#You can remove these regions from a PED file using the following PLINK commands. | ||
#plink --file mydata --make-set high-ld.txt --write-set --out hild | ||
#plink --file mydata --exclude hild.set --recode --out mydatatrimmed | ||
#Chr Start Stop | ||
1 47761740 51761740 | ||
1 125169943 125170022 | ||
1 144106678 144106709 | ||
1 181955019 181955047 | ||
2 85919365 100517106 | ||
2 87416141 87416186 | ||
2 87417804 87417863 | ||
2 87418924 87418981 | ||
2 89917298 89917322 | ||
2 135275091 135275210 | ||
2 182427027 189427029 | ||
2 207609786 207609808 | ||
3 47483505 49987563 | ||
3 83368158 86868160 | ||
5 44464140 51168409 | ||
5 129636407 132636409 | ||
6 25391792 33424245 | ||
6 26726947 26726981 | ||
6 57788603 58453888 | ||
6 61109122 61357029 | ||
6 61424410 61424451 | ||
6 139637169 142137170 | ||
7 54964812 66897578 | ||
7 62182500 62277073 | ||
8 8105067 12105082 | ||
8 43025699 48924888 | ||
8 47303500 47317337 | ||
8 110918594 113918595 | ||
9 40365644 40365693 | ||
9 64198500 64200392 | ||
9 88958735 88959017 | ||
10 36671065 43184546 | ||
10 41693521 41885273 | ||
11 88127183 91127184 | ||
12 32955798 41319931 | ||
12 34639034 34639084 | ||
14 87391719 87391996 | ||
14 94658026 94658080 | ||
17 43159541 43159574 | ||
20 4031884 4032441 | ||
20 33948532 36438183 | ||
22 30060084 30060162 | ||
22 42980497 42980522 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import argparse
import os
import sys
import time
from multiprocessing import cpu_count

# Make the directory containing this script importable BEFORE importing the
# local ``wbbc`` module; appending it afterwards cannot help that import.
# ``abspath`` keeps the entry valid even if the working directory changes
# later (``__file__`` may be a relative path on some Python versions).
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import wbbc  # noqa: E402  (must follow the sys.path adjustment above)
|
||
|
||
def arguments(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour existing callers rely on.

    Returns:
        argparse.Namespace: with the attributes ``tsvFile``, ``tsvPath``,
        ``alleleFrqFile``, ``afDigits``, ``stdev``, ``highLD`` and ``thread``.

    Note:
        Every option is declared with ``nargs="?"`` and no ``const``, so an
        option given without a value parses to ``None``.
    """
    # multiprocessing.cpu_count() signals "unknown" by raising
    # NotImplementedError, not by returning 0, so the fallback has to be
    # an exception handler rather than a comparison.
    try:
        defaultThreads = cpu_count()
    except NotImplementedError:
        defaultThreads = 4

    # argument parser
    parser = argparse.ArgumentParser()

    # TSV raw data file
    parser.add_argument(
        "-tf",
        "--tsvFile",
        nargs="?",
        required=False,
        help="file name of TSV raw data",
    )

    # TSV raw data path
    parser.add_argument(
        "-tp",
        "--tsvPath",
        nargs="?",
        required=False,
        help="path of TSV raw data",
    )

    # filename of allele frequency without extension (defaults to today's date)
    parser.add_argument(
        "-af",
        "--alleleFrqFile",
        nargs="?",
        default="wbbc_{}".format(time.strftime("%Y-%m-%d", time.localtime())),
        help="file name of allele frequency without extension",
    )

    # decimal digits of alleles frequency
    parser.add_argument(
        "-ad",
        "--afDigits",
        nargs="?",
        type=int,
        default=6,
        help="decimal digits of alleles frequency",
    )

    # threshold of standard deviation of alleles frequency
    parser.add_argument(
        "-sd",
        "--stdev",
        nargs="?",
        type=float,
        default=0.03,
        help="threshold of standard deviation of alleles frequency",
    )

    # filename of regions of High Linkage Disequilibrium
    parser.add_argument(
        "-hld",
        "--highLD",
        nargs="?",
        default="",
        help="filename of regions of High Linkage Disequilibrium. Refer to: https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)",
    )

    # number of threads used for concurrency
    parser.add_argument(
        "-th",
        "--thread",
        nargs="?",
        type=int,
        default=defaultThreads,
        help="threads count for concurrency",
    )

    return parser.parse_args(argv)
|
||
|
||
# Original author's note: based on the TSV data from genotyping chips, select
# the qualifying variants and their frequencies from the WBBC VCF and generate
# the alleles and frequency files used as the reference dataset of the
# ancestry calculator.
def main():
    """Collect the TSV inputs, validate them and build the allele-frequency files.

    TSV files come from ``--tsvFile`` (a whitespace-separated list of file
    names) and/or ``--tsvPath`` (every regular file directly inside that
    directory).  When at least one file was given and all of them exist, the
    work is delegated to ``wbbc.make_allele_frq``.

    Returns:
        int: ``0`` once ``wbbc.make_allele_frq`` has returned, ``1`` when the
        inputs were missing or invalid (a message is printed in that case).
    """
    # get arguments
    args = arguments()

    # collection of TSV files to process
    tsvFiles = []

    # explicitly listed TSV files
    if args.tsvFile is not None:
        tsvFiles.extend(args.tsvFile.split())

    # every regular file inside the given TSV directory
    if args.tsvPath is not None:
        tsvPath = args.tsvPath.strip()
        if not os.path.isdir(tsvPath):
            # os.listdir() would otherwise abort with a raw traceback.
            print("TSV path {} is not a directory.".format(tsvPath))
            return 1
        # os.listdir() yields names in arbitrary order; sort them so that
        # repeated runs process the files in the same sequence.
        for name in sorted(os.listdir(tsvPath)):
            candidate = os.path.join(tsvPath, name)
            if os.path.isfile(candidate):
                tsvFiles.append(candidate)

    if not tsvFiles:
        print("Please set one or more TSV files or a path.")
        return 1

    # Report every unavailable file, not only the first one.
    missing = [path for path in tsvFiles if not os.access(path, os.F_OK)]
    for path in missing:
        print("TSV file {} is not available.".format(path))
    if missing:
        return 1

    wbbc.make_allele_frq(
        tsvFiles,
        args.highLD,
        args.alleleFrqFile,
        args.afDigits,
        args.stdev,
        args.thread,
    )
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and pipelines can detect failures.
    sys.exit(main())
Oops, something went wrong.