-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRegtools.py
72 lines (56 loc) · 3 KB
/
Regtools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import glob
import subprocess
import pandas as pd
# Define input directory containing SAM files
input_dir = "/home/weilan/STAR/subset/Regtools/HISAT2_regtools/SAM"
# Step 1: Convert SAM to BAM and index BAM files
sam_files = glob.glob(os.path.join(input_dir, "*.sam"))
for sam_file in sam_files:
base = os.path.basename(sam_file).replace(".sam", "")
bam_file = os.path.join(input_dir, base + ".bam")
# Check if BAM file already exists
if os.path.exists(bam_file):
print(f"BAM file already exists for {sam_file}. Skipping conversion.")
else:
# Convert SAM to BAM
os.system(f"samtools view -b {sam_file} > {bam_file}")
# Index BAM file
os.system(f"samtools index {bam_file}")
print(f"Converted {sam_file} to BAM and indexed {bam_file}.")
print("SAM to BAM conversion and indexing completed.")
# Step 2: Extract junctions and annotate junctions from BAM files
bam_files = glob.glob(os.path.join(input_path, "*hisat2.dam"))
for bam_file in bam_files:
base = os.path.basename(bam_file).replace("hisat2.bam", "")
# Extract junctions
os.system(f"/home/hari/Tools/regtools/build/regtools junctions extract {bam_file} -s XS > {base}_junctions.bed")
# Annotate junctions
os.system(f"/home/hari/Tools/regtools/build/regtools junctions annotate {base}_junctions.bed /home/weilan/HISAT2/Homo_sapiens.GRCh38.dna.primary_assembly.fasta /home/weilan/HISAT2/Homo_sapiens.GRCh38.111.gtf -o {base}hisat2_junctions_annotation.txt")
# Step 3: Summarize counts from text files
# Discover text files in a directory
dir_path = "/home/weilan/STAR/subset/Regtools/HISAT2_regtools/" # Replace with the path to your directory containing text files
file_paths = glob.glob(os.path.join(dir_path, "*hisat2_junctions_annotation.txt"))
# Read and summarize counts from text files
def summarize_counts(file_path):
# Read counts data from text file
counts_data = pd.read_table(file_path) # Adjust parameters based on file format
# Calculate total counts for column named "Column4"
total_counts = counts_data["name"].count()
# Calculate counts for categories in two columns
category_counts_col1 = counts_data["anchor"].value_counts()
category_counts_col2 = counts_data["known_junction"].value_counts()
# Return summarized counts data
return pd.DataFrame({
"File": [os.path.basename(file_path)],
"Total_Counts": [total_counts],
"Category_Counts_Column1": [category_counts_col1.to_dict()],
"Category_Counts_Column2": [category_counts_col2.to_dict()]
})
# Apply the summarize_counts function to each file path
summarized_counts = [summarize_counts(file_path) for file_path in file_paths]
# Step 4: Combine summarized counts data into one data frame
combined_counts = pd.concat(summarized_counts)
# Step 5: Write combined counts data to a new text file
output_file_path = os.path.join(dir_path, "hisat2_combined_counts_summary.txt")
combined_counts.to_csv(output_file_path, sep="\t", index=False)