-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbCAN3_fam_summary.py
44 lines (34 loc) · 1.69 KB
/
dbCAN3_fam_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
import os
# Directory holding the per-sample dbCAN3 family-abundance tables ('*.out')
directory = '/home/weilan/Desktop/Raf_shotgun_desktop/Raf_shotgun_dbCAN3_contigs/Contigs_PUL_Substrates/fam_abund'
# Directory that receives the combined table
output_directory = '/home/weilan/Desktop/Raf_shotgun_desktop/Raf_shotgun_dbCAN3_contigs/Contigs_PUL_Substrates'


def list_abundance_files(input_directory):
    """Return the names of the '.out' files in *input_directory*, sorted.

    os.listdir() yields entries in arbitrary order; sorting keeps the sample
    columns of the combined table in a reproducible order between runs.
    """
    return sorted(f for f in os.listdir(input_directory) if f.endswith('.out'))


def combine_family_abundance(input_directory, file_names):
    """Merge per-sample family-abundance tables into one wide table.

    Each file is read as a tab-separated table with "Family" and "Abundance"
    columns. The "Abundance" column is renamed to the sample ID (the part of
    the file name before '_fam_abund.out') and the tables are outer-joined
    on "Family". Families missing from a sample are filled with 0.

    Args:
        input_directory: Directory that contains the files.
        file_names: Names of the files to combine, in column order.

    Returns:
        A DataFrame with a "Family" column followed by one column per sample.
        With no input files the result has only the "Family" column.
    """
    # None (rather than DataFrame.empty) marks "nothing merged yet": a sample
    # file with a header but no rows yields a frame whose .empty is True, and
    # testing .empty would let the next file overwrite it, silently dropping
    # that sample's column from the output.
    combined = None
    for file_name in file_names:
        file_path = os.path.join(input_directory, file_name)
        data = pd.read_csv(file_path, sep="\t", usecols=["Family", "Abundance"])
        # Sample ID is the part of the file name before '_fam_abund.out'
        sample_id = file_name.split("_fam_abund.out")[0]
        sample_table = data.rename(columns={"Abundance": sample_id}).set_index("Family")
        if combined is None:
            combined = sample_table
        else:
            combined = combined.join(sample_table, how="outer")

    if combined is None:
        # No input files: still return a well-formed (empty) table
        combined = pd.DataFrame(index=pd.Index([], name="Family"))

    # A family absent from a sample is reported as abundance 0
    combined = combined.fillna(0)
    # Turn 'Family' back into a regular column
    return combined.reset_index()


def main():
    """Combine all sample tables in `directory` and save them as one CSV."""
    file_names = list_abundance_files(directory)
    # Display the files
    print(file_names)

    adjusted_data = combine_family_abundance(directory, file_names)

    # Save the combined data to the output directory
    adjusted_output_path = os.path.join(output_directory, "adjusted_combined_fam_abund.csv")
    adjusted_data.to_csv(adjusted_output_path, index=False)

    # Show the combined table's structure (a bare .head() expression prints
    # nothing when run as a script)
    print(adjusted_data.head())


if __name__ == "__main__":
    main()