Skip to content

Commit

Permalink
Signed-off-by: Mehmet <[email protected]>
Browse files Browse the repository at this point in the history
  • Loading branch information
mahui-cn committed Oct 1, 2024
0 parents commit 7418349
Show file tree
Hide file tree
Showing 23 changed files with 12,811,142 additions and 0 deletions.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions high_ld/high_ld_hg19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)
#There are regions of long-range, high linkage disequilibrium in the human genome. These regions should be excluded when performing certain analyses such as principal component analysis on genotype data.
#Here is a list of positions for GRCH Build 37
#You can remove these regions from a PED file using the following PLINK commands.
#plink --file mydata --make-set high-ld.txt --write-set --out hild
#plink --file mydata --exclude hild.set --recode --out mydatatrimmed
#Chr Start Stop
1 48000000 52000000
2 86000000 100500000
2 134500000 138000000
2 183000000 190000000
3 47500000 50000000
3 83500000 87000000
3 89000000 97500000
5 44500000 50500000
5 98000000 100500000
5 129000000 132000000
5 135500000 138500000
6 25000000 35000000
6 57000000 64000000
6 140000000 142500000
7 55000000 66000000
8 7000000 13000000
8 43000000 50000000
8 112000000 115000000
10 37000000 43000000
11 46000000 57000000
11 87500000 90500000
12 33000000 40000000
12 109500000 112000000
20 32000000 34500000
50 changes: 50 additions & 0 deletions high_ld/high_ld_hg38.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)
#There are regions of long-range, high linkage disequilibrium in the human genome. These regions should be excluded when performing certain analyses such as principal component analysis on genotype data.
#Here is a list of positions for GRCH Build 38
#You can remove these regions from a PED file using the following PLINK commands.
#plink --file mydata --make-set high-ld.txt --write-set --out hild
#plink --file mydata --exclude hild.set --recode --out mydatatrimmed
#Chr Start Stop
1 47761740 51761740
1 125169943 125170022
1 144106678 144106709
1 181955019 181955047
2 85919365 100517106
2 87416141 87416186
2 87417804 87417863
2 87418924 87418981
2 89917298 89917322
2 135275091 135275210
2 182427027 189427029
2 207609786 207609808
3 47483505 49987563
3 83368158 86868160
5 44464140 51168409
5 129636407 132636409
6 25391792 33424245
6 26726947 26726981
6 57788603 58453888
6 61109122 61357029
6 61424410 61424451
6 139637169 142137170
7 54964812 66897578
7 62182500 62277073
8 8105067 12105082
8 43025699 48924888
8 47303500 47317337
8 110918594 113918595
9 40365644 40365693
9 64198500 64200392
9 88958735 88959017
10 36671065 43184546
10 41693521 41885273
11 88127183 91127184
12 32955798 41319931
12 34639034 34639084
14 87391719 87391996
14 94658026 94658080
17 43159541 43159574
20 4031884 4032441
20 33948532 36438183
22 30060084 30060162
22 42980497 42980522
126 changes: 126 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-

import sys, os
import argparse
import time
from multiprocessing import cpu_count
import wbbc

# Add the directory containing this script to the module search path.
# NOTE(review): this statement runs after the `import wbbc` above, so it
# cannot influence that import -- confirm whether it is still needed or
# should be moved ahead of the project imports.
sys.path.append(os.path.dirname(__file__))


def arguments():
    """Parse the command-line options of the script.

    Every option takes at most one value (``nargs="?"``).

    Returns:
        argparse.Namespace with the attributes ``tsvFile``, ``tsvPath``,
        ``alleleFrqFile``, ``afDigits``, ``stdev``, ``highLD`` and ``thread``.
    """
    # Default output name carries today's date, e.g. "wbbc_2024-10-01".
    today = time.strftime("%Y-%m-%d", time.localtime())

    # Fall back to 4 workers when the reported CPU count is not positive.
    cores = cpu_count()
    default_threads = cores if cores > 0 else 4

    # (flags, keyword arguments) for each option, in help-output order.
    option_table = (
        # TSV raw data file
        (
            ("-tf", "--tsvFile"),
            dict(required=False, help="file name of TSV raw data"),
        ),
        # TSV raw data path
        (
            ("-tp", "--tsvPath"),
            dict(required=False, help="path of TSV raw data"),
        ),
        # file name of the allele frequency output, without extension
        (
            ("-af", "--alleleFrqFile"),
            dict(
                default="wbbc_" + today,
                help="file name of allele frequency without extension",
            ),
        ),
        # decimal digits of the allele frequencies
        (
            ("-ad", "--afDigits"),
            dict(
                type=int,
                default=6,
                help="decimal digits of alleles frequency",
            ),
        ),
        # threshold of the standard deviation of the allele frequencies
        (
            ("-sd", "--stdev"),
            dict(
                type=float,
                default=0.03,
                help="threshold of standard deviation of alleles frequency",
            ),
        ),
        # file listing the regions of high linkage disequilibrium
        (
            ("-hld", "--highLD"),
            dict(
                default="",
                help="filename of regions of High Linkage Disequilibrium. Refer to: https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)",
            ),
        ),
        # number of concurrent workers
        (
            ("-th", "--thread"),
            dict(
                type=int,
                default=default_threads,
                help="threads count for concurrency",
            ),
        ),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, nargs="?", **settings)

    return cli.parse_args()


# Using TSV data from genotyping chips, select the qualifying variants and
# frequency data from the WBBC VCF and generate the alleles and frequency
# files that serve as the reference dataset for the ancestry calculator.
def main():
    """Collect the TSV inputs named on the command line and build the dataset.

    TSV files come from ``--tsvFile`` (a whitespace-separated list of file
    names) and/or ``--tsvPath`` (every regular file directly inside that
    directory). ``wbbc.make_allele_frq`` is called only when at least one
    file was collected and every collected file exists; otherwise a message
    is printed and nothing is generated.
    """
    # get arguments
    args = arguments()

    # collected TSV files
    tsv_files = []

    # explicitly named TSV file(s)
    if args.tsvFile is not None:
        tsv_files.extend(args.tsvFile.strip().split())

    # every regular file inside the given TSV directory
    if args.tsvPath is not None:
        tsv_dir = args.tsvPath.strip()
        # os.listdir() raises on a missing or non-directory path; report it
        # like any other unavailable input instead of dying with a traceback.
        if not os.path.isdir(tsv_dir):
            print("TSV path {} is not available.".format(tsv_dir))
            return
        for name in os.listdir(tsv_dir):
            candidate = os.path.join(tsv_dir, name)
            if os.path.isfile(candidate):
                tsv_files.append(candidate)

    if not tsv_files:
        print("Please set one more TSV files or path.")
        return

    # Report every missing file before giving up, not just the first one.
    missing = [path for path in tsv_files if not os.access(path, os.F_OK)]
    for path in missing:
        print("TSV file {} is not available.".format(path))
    if missing:
        return

    wbbc.make_allele_frq(
        tsv_files,
        args.highLD,
        args.alleleFrqFile,
        args.afDigits,
        args.stdev,
        args.thread,
    )

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 7418349

Please sign in to comment.