Skip to content

Commit

Permalink
输出的祖源模型,放入文件夹 admix_model
Browse files Browse the repository at this point in the history
  • Loading branch information
mahui-cn committed Oct 8, 2024
1 parent 27f45a1 commit 97fb49a
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 36 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
__pycache__/*
wbbc_vcf/*
wbbc_vcf_hg38/*
*.alleles
*.F
admix_model/*
4 changes: 3 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
"-tp",
"tsv_tmpl",
"-hld",
"high_ld/high_ld_hg19.txt"
"high_ld/high_ld_hg19.txt",
"-mp",
"admix_model"
]
}
]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Create Admixture model files from WBBC project of West Lake University
Create admixture model files (*.F, *.alleles) based on WBBC project of West Lake University. Microarray chip files (TSV format) from popular genetic testing companies are used as referenced SNPs. Linkage disequilibrium SNPs could be eliminated optionally. The output admixture model files could be used for model based admixture calculator such as https://geneu.xyz/user-profile
Create admixture model files (*.F, *.alleles) based on the WBBC project of West Lake University. Microarray chip files (TSV format) from popular genetic testing companies are used as reference SNPs. SNPs in regions of high linkage disequilibrium (LD) can optionally be excluded (https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)). The output admixture model files can be used with model-based admixture calculators such as https://geneu.xyz/user-profile

This program runs with multiple threads according to the user's CPU cores; the default is 4 threads.

Expand Down
72 changes: 43 additions & 29 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,23 @@ def arguments():
help="path of TSV raw data",
)

# Path of admix model files
parser.add_argument(
"-mp",
"--modelPath",
nargs="?",
required=False,
default=".",
help="path of admix model files",
)

# filename of allele frequency without extension
parser.add_argument(
"-af",
"--alleleFrqFile",
nargs="?",
default="wbbc_{}".format(time.strftime("%Y-%m-%d", time.localtime())),
help="file name of allele frequency without extension",
help="file name of allele and frequency without extension",
)

# decimal digits of alleles frequency
Expand Down Expand Up @@ -84,43 +94,47 @@ def arguments():

# 基于基因芯片的TSV数据,从WBBC的VCF中筛选出符合条件的突变和频率数据,生成alleles和frequency文件作为祖源计算器的参考数据集
def main():
# get arguments
args = arguments()

# TSV文件集合
tsvFiles = []

# 指定TSV文件
if args.tsvFile != None:
tsvFiles.extend(args.tsvFile.strip().split())

# 指定TSV目录
if args.tsvPath != None:
tsvPath = args.tsvPath.strip()
for file in os.listdir(tsvPath):
if os.path.isfile(os.path.join(tsvPath, file)):
tsvFiles.append(os.path.join(tsvPath, file))

if len(tsvFiles) > 0:
isTsvExist = True
for file in tsvFiles:
if not os.access(file, os.F_OK):
print("TSV file {} is not available.".format(file))
isTsvExist = False
else:
print("Please set one more TSV files or path.")
isTsvExist = False

if isTsvExist:
try:
# get arguments
args = arguments()

# TSV文件集合
tsvFiles = []

# 指定TSV文件
if args.tsvFile != None:
tsvFiles.extend(args.tsvFile.strip().split())

# 指定TSV目录
if args.tsvPath != None:
tsvPath = args.tsvPath.strip()
for file in os.listdir(tsvPath):
if os.path.isfile(os.path.join(tsvPath, file)):
tsvFiles.append(os.path.join(tsvPath, file))

if len(tsvFiles) > 0:
for file in tsvFiles:
if not os.access(file, os.F_OK):
raise Exception("TSV file {} is not available.".format(file))
else:
raise Exception("Please set one more TSV files or path.")

if not os.path.exists(args.modelPath):
raise Exception("Please set correct path of admix model files.")

wbbc.make_allele_frq(
tsvFiles,
args.highLD,
args.modelPath,
args.alleleFrqFile,
args.afDigits,
args.stdev,
args.thread,
)

except Exception as e:
print(f"发生错误:{e}")


if __name__ == "__main__":
main()
9 changes: 6 additions & 3 deletions wbbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


# https://wbbc.westlake.edu.cn/
# 根据西湖中国样本库,生成祖源模型数据文件
# 根据西湖中国样本库,生成祖源模型文件
# The VCF is annotated with rsIDs from dbSNP151, and the following INFO fields:
# AC:Allele count in called genotypes in WBBC
# AF:Allele frequency in called genotypes in WBBC
Expand All @@ -27,6 +27,7 @@
def make_allele_frq(
tsvFiles,
highLDFileName="",
modelPath=".",
alleleFrqFile="wbbc",
afDigits=6,
stdDevThreshold=0.03,
Expand All @@ -40,9 +41,11 @@ def make_allele_frq(
snp_total_count = 0

with open(
"{}.alleles".format(alleleFrqFile), "w", encoding="utf-8"
"{}/{}.alleles".format(modelPath, alleleFrqFile), "w", encoding="utf-8"
) as alleleFile:
with open("{}.F".format(alleleFrqFile), "w", encoding="utf-8") as frqFile:
with open(
"{}/{}.F".format(modelPath, alleleFrqFile), "w", encoding="utf-8"
) as frqFile:
# 多线程遍历所有VCF文件
with ThreadPoolExecutor(max_workers=maxWorkers) as t:
task_list = []
Expand Down

0 comments on commit 97fb49a

Please sign in to comment.