Skip to content

Commit

Permalink
unzip_or_redownload.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
TeamSPoon committed Dec 26, 2023
1 parent 3a6f471 commit 5ad5524
Show file tree
Hide file tree
Showing 8 changed files with 4,632 additions and 31 deletions.
28 changes: 28 additions & 0 deletions data/essential_pairs_2023_05.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
ncRNA_genes primaryId symbol primaryId taxonId primaryId soTermId primaryId gene_geneId taxonId soTermId taxonId gene_geneId soTermId gene_geneId gene_geneId gene_symbol gene_geneId gene_locusTag
ncRNA_genes_synonyms symbol1 symbol2
ncRNA_genes_cross_references symbol1 symbol2
ncRNA_genes_related_sequences primaryId sequenceId sequenceId relationship
ncRNA_genes_gene_synonyms symbol1 symbol2
ncRNA_genes_publications primaryId publication
ncRNA_genes_genome_locations primaryId assembly primaryId gca_accession primaryId INSDC_accession assembly gca_accession assembly INSDC_accession assembly chromosome assembly strand assembly startPosition assembly endPosition gca_accession INSDC_accession INSDC_accession chromosome INSDC_accession strand INSDC_accession startPosition INSDC_accession endPosition
fbgn_fbtr_fbpp_expanded organism gene_ID gene_ID gene_symbol gene_ID gene_fullname gene_ID annotation_ID gene_ID transcript_ID gene_ID polypeptide_ID annotation_ID transcript_ID annotation_ID polypeptide_ID transcript_type transcript_ID transcript_ID transcript_symbol transcript_ID polypeptide_ID polypeptide_ID polypeptide_symbol
dmel_human_orthologs_disease Dmel_gene_ID Dmel_gene_symbol Dmel_gene_ID Human_gene_HGNC_ID Dmel_gene_ID Human_gene_OMIM_ID Dmel_gene_ID DIOPT_score Dmel_gene_ID OMIM_Phenotype_IDs[name] Human_gene_HGNC_ID Human_gene_OMIM_ID Human_gene_HGNC_ID Human_gene_symbol Human_gene_HGNC_ID DIOPT_score Human_gene_HGNC_ID OMIM_Phenotype_IDs[name] Human_gene_OMIM_ID Human_gene_symbol Human_gene_OMIM_ID DIOPT_score Human_gene_OMIM_ID OMIM_Phenotype_IDs[name] DIOPT_score OMIM_Phenotype_IDs[name]
Dmel_enzyme_data gene_group_id gene_group_name gene_group_id gene_group_GO_id(s) gene_group_id gene_group_EC_number(s) gene_group_id gene_id gene_group_id gene_EC_number(s) gene_group_GO_id(s) gene_group_EC_number(s) gene_group_GO_id(s) gene_id gene_group_GO_id(s) gene_EC_number(s) gene_group_EC_number(s) gene_id gene_group_EC_number(s) gene_EC_number(s) gene_id gene_symbol gene_id gene_name gene_id gene_EC_number(s)
gene_groups_HGNC FB_group_id FB_group_symbol FB_group_id FB_group_name FB_group_id HGNC_family_ID
best_gene_summary FBgn_ID Gene_Symbol FBgn_ID Summary_Source FBgn_ID Summary
gene_map_table organism_abbreviation primary_FBid current_symbol primary_FBid primary_FBid recombination_loc primary_FBid cytogenetic_loc primary_FBid sequence_loc
pathway_group_data FB_group_id FB_group_symbol FB_group_id FB_group_name FB_group_id Parent_FB_group_id FB_group_id Group_member_FB_gene_id Parent_FB_group_id Parent_FB_group_symbol Parent_FB_group_id Group_member_FB_gene_id Group_member_FB_gene_id Group_member_FB_gene_symbol
genotype_phenotype_data genotype_symbols genotype_FBids genotype_FBids phenotype_id genotype_FBids qualifier_ids genotype_FBids reference phenotype_name phenotype_id phenotype_id qualifier_ids qualifier_names qualifier_ids
gene_group_data FB_group_id FB_group_symbol FB_group_id FB_group_name FB_group_id Parent_FB_group_id FB_group_id Group_member_FB_gene_id Parent_FB_group_id Parent_FB_group_symbol Parent_FB_group_id Group_member_FB_gene_id Group_member_FB_gene_id Group_member_FB_gene_symbol
dmel_unique_protein_isoforms FBgn FB_gene_symbol FBgn representative_protein FBgn identical_protein(s)
dmel_gene_sequence_ontology_annotations gene_primary_id gene_symbol gene_primary_id so_term_id
scRNA-Seq_gene_expression Pub_ID Pub_miniref Pub_ID Clustering_Analysis_ID Pub_ID Cluster_ID Pub_ID Cluster_Cell_Type_ID Pub_ID Gene_ID Clustering_Analysis_ID Clustering_Analysis_Name Clustering_Analysis_ID Source_Tissue_Sex Clustering_Analysis_ID Source_Tissue_Stage Clustering_Analysis_ID Source_Tissue_Anatomy Clustering_Analysis_ID Cluster_ID Clustering_Analysis_ID Cluster_Cell_Type_ID Clustering_Analysis_ID Gene_ID Source_Tissue_Sex Cluster_ID Source_Tissue_Sex Cluster_Cell_Type_ID Source_Tissue_Stage Cluster_ID Source_Tissue_Stage Cluster_Cell_Type_ID Source_Tissue_Anatomy Cluster_ID Source_Tissue_Anatomy Cluster_Cell_Type_ID Cluster_ID Cluster_Name Cluster_ID Cluster_Cell_Type_ID Cluster_ID Gene_ID Cluster_Cell_Type_ID Cluster_Cell_Type_Name Cluster_Cell_Type_ID Gene_ID Gene_ID Gene_Symbol Gene_ID Mean_Expression Gene_ID Spread
gene_genetic_interactions Starting_gene(s)_symbol Starting_gene(s)_FBgn Starting_gene(s)_FBgn Interacting_gene(s)_symbol Starting_gene(s)_FBgn Interacting_gene(s)_FBgn Starting_gene(s)_FBgn Interaction_type Starting_gene(s)_FBgn Publication_FBrf Interacting_gene(s)_symbol Interacting_gene(s)_FBgn Interacting_gene(s)_FBgn Interaction_type Interacting_gene(s)_FBgn Publication_FBrf
gene_rpkm_report Release_ID FBgn# FBgn# GeneSymbol FBgn# Parent_library_FBlc# FBgn# RNASource_FBlc# FBgn# RPKM_value FBgn# Bin_value FBgn# Unique_exon_base_count FBgn# Total_exon_base_count FBgn# Count_used Parent_library_FBlc# Parent_library_name Parent_library_FBlc# RNASource_FBlc# RNASource_FBlc# RNASource_name
physical_interactions_mitab ID(s) Interactor A ID(s) Interactor B ID(s) Interactor A Alt ID(s) Interactor A ID(s) Interactor A Alias(es) Interactor A ID(s) Interactor A Interaction Detection Method(s) ID(s) Interactor A Publication ID(s) ID(s) Interactor A Taxid Interactor A ID(s) Interactor A Interaction Type(s) ID(s) Interactor A Source Database(s) ID(s) Interactor A Interaction Identifier(s) ID(s) Interactor A Biological Role(s) Interactor A ID(s) Interactor A Experimental Role(s) Interactor A ID(s) Interactor A Type(s) Interactor A ID(s) Interactor A Xref(s) Interactor A ID(s) Interactor A Interaction Xref(s) ID(s) Interactor A Annotation(s) Interactor A ID(s) Interactor A Interaction Annotation(s) ID(s) Interactor B Alt ID(s) Interactor B ID(s) Interactor B Alias(es) Interactor B ID(s) Interactor B Interaction Detection Method(s) ID(s) Interactor B Publication ID(s) ID(s) Interactor B Taxid Interactor B ID(s) Interactor B Interaction Type(s) ID(s) Interactor B Source Database(s) ID(s) Interactor B Interaction Identifier(s) ID(s) Interactor B Biological Role(s) Interactor B ID(s) Interactor B Experimental Role(s) Interactor B ID(s) Interactor B Type(s) Interactor B ID(s) Interactor B Xref(s) Interactor B ID(s) Interactor B Interaction Xref(s) ID(s) Interactor B Annotation(s) Interactor B ID(s) Interactor B Interaction Annotation(s) ID(s) Interactor B Host Organism(s) Publication 1st Author(s) Publication ID(s) Publication ID(s) Interaction Identifier(s) Interaction Type(s) Source Database(s) Interaction Type(s) Interaction Identifier(s) Source Database(s) Interaction Identifier(s) Interaction Identifier(s) Interaction Xref(s) Interaction Identifier(s) Interaction Annotation(s) Interaction Xref(s) Interaction Annotation(s)
fb_synonym primary_FBid organism_abbreviation primary_FBid current_symbol primary_FBid current_fullname primary_FBid fullname_synonym(s) primary_FBid symbol_synonym(s)
allele_genetic_interactions allele_symbol allele_FBal# allele_FBal# interaction allele_FBal# FBrf# interaction FBrf#
fbrf_pmid_pmcid_doi FBrf PMID FBrf PMCID FBrf DOI FBrf pub_type FBrf miniref FBrf pmid_added
disease_model_annotations FBgn ID Gene symbol FBgn ID HGNC ID FBgn ID DO ID FBgn ID Allele used in model (FBal ID) FBgn ID Based on orthology with (HGNC ID) FBgn ID Evidence/interacting alleles FBgn ID Reference (FBrf ID) HGNC ID DO ID HGNC ID Allele used in model (FBal ID) HGNC ID Based on orthology with (HGNC ID) HGNC ID Evidence/interacting alleles HGNC ID Reference (FBrf ID) DO qualifier DO ID DO ID Allele used in model (FBal ID) DO ID Based on orthology with (HGNC ID) DO ID Evidence/interacting alleles DO ID Reference (FBrf ID) Allele used in model (FBal ID) Allele used in model (symbol) Allele used in model (FBal ID) Based on orthology with (HGNC ID) Allele used in model (FBal ID) Evidence/interacting alleles Allele used in model (FBal ID) Reference (FBrf ID) Based on orthology with (HGNC ID) Based on orthology with (symbol) Based on orthology with (HGNC ID) Evidence/interacting alleles Based on orthology with (HGNC ID) Reference (FBrf ID)
fbal_to_fbgn AlleleID AlleleSymbol AlleleID GeneID GeneID GeneSymbol
gene_association DB DB_Object_ID DB_Object_ID DB_Object_Symbol DB_Object_ID Qualifier DB_Object_ID GO ID DB_Object_ID DB:Reference DB_Object_ID Evidence DB_Object_ID DB_Object_Name DB_Object_ID DB_Object_Synonym DB_Object_ID DB_Object_Type DB_Object_ID Assigned_by Qualifier GO ID GO ID Evidence GO ID With (or) From
39 changes: 39 additions & 0 deletions data/readme.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<!DOCTYPE html>
<html>
<head>
<title>Dataset Directory</title>
</head>
<body>

This directory hosts a variety of data files and subdirectories related to the VersionSpace-in-MeTTa project.

<h2>Combined Files</h2>
<ul>
<li><a href="whole_flybase.datalog.metta">whole_flybase.datalog.metta</a><a href="whole_flybase.datalog.metta.gz">(.gz)</a> - (4.1G/382mb): The Combination of all Flybase Precomputed Files in MeTTa</li>
<li><a href="whole_flybase.datalog">whole_flybase.datalog</a><a href="whole_flybase.datalog.gz">(.gz)</a> - (4.4G/392mb): Datalog file for the above. </li>
<li><a href="whole_flybase.qlf">whole_flybase.qlf</a><a href="whole_flybase.qlf.gz">(.gz)</a> - (2.0G/364mb): Quick Load Format - Loads all of the above in under 30 seconds!<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
(In both the official Rust version and the MeTTaLog version) </li>
</ul>
<h2>Contents</h2>
<ul>
<li><a href="ftp.flybase.org/releases/FB2023_05/precomputed_files/">Individual MeTTa Files</a> - Contains the individualized breakdowns of the above files from the FlyBase precomputed release FB2023_05.</li>

<li><a href="supplimental/">Supplemental</a> - Extra context or information used in our dataset.
<ul>
<li><a href="./supplimental/12_ontologies/">12_ontologies</a> - A collection of 12 different ontologies. (contains the individualized breakdowns of the above files)</li>

<li><a href="./supplimental/public.pub.tsv">public.pub.tsv</a> - Tab-separated values file of public publications.</li>
</ul>
</li>
<li><a href="extra-supplimental/">Extra-Supplemental</a> - Additional supplemental files optional for us to load on demand.</li>
<li><a href="tsv_exports/">TSV Exports</a> - Entire Chado SQL Database in TSV format.</li>
<li><a href="mozi.ai/">Mozi AI</a> - AI models and datasets from mozi.ai.</li>
<li><a href="archive/">Archive</a> - Historical data and previous versions.</li>
</ul>


</body>
</html>


73 changes: 73 additions & 0 deletions data/unzip_or_redownload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#
# Checks the .gz files below the download root, re-fetches suspicious ones
# and unpacks whatever has not been unpacked yet.

# Directory holding the wget download; the script changes into it before scanning.
download_root_directory=.

printf 'Starting script in the download root directory: %s\n' "$download_root_directory"
#######################################
# Re-fetch a single file of the wget download.
# Globals:
#   download_root_directory (read) - prefix removed from the given path
# Arguments:
#   $1 - path of the file below the download root, e.g. "./host/dir/file.gz"
# Outputs:
#   Progress messages on stdout; an error message on stderr when wget fails.
# Returns:
#   0 when wget succeeded, otherwise wget's non-zero exit status.
#######################################
redownload() {
  local target_path=$1
  # Quote the prefix so it is stripped literally instead of being matched as
  # a glob pattern (a root such as "[mirror]" would otherwise never match).
  local relative_path=${target_path#"$download_root_directory"/}
  local wget_status=0

  echo "Redownloading $relative_path..."
  # The path relative to the root is used verbatim as the remainder of the URL.
  wget -m "http://$relative_path" || wget_status=$?
  if (( wget_status != 0 )); then
    # Do not claim completion when the transfer failed.
    echo "Redownload failed for $relative_path (wget exit status $wget_status)" >&2
    return "$wget_status"
  fi
  echo "Redownload completed for $relative_path"
}

# Navigate to the root directory of the wget download.
# Abort when that is not possible: the following stages run `find .` and `rm`
# relative to the current directory and must not touch an unintended location.
if ! cd -- "$download_root_directory"; then
  echo "Error: cannot change into download root directory: $download_root_directory" >&2
  exit 1
fi

echo "Searching for .gz files to verify and potentially redownload..."
#######################################
# Perform the initial check on one .gz file.
#
# When a decompressed sibling (same path without ".gz") exists, its size is
# compared with the size of the archive. A sibling that is not larger than
# the archive is treated as corrupted: it is deleted and the archive is
# re-fetched through redownload.
# NOTE(review): this size heuristic also flags files that are legitimately
# smaller than their archive (e.g. very small files) - confirm it is intended.
# Arguments:
#   $1 - path to the .gz file
# Outputs:
#   Status messages on stdout; an error on stderr when a size cannot be read.
# Returns:
#   0 normally, 1 when a file size could not be determined, otherwise the
#   status of rm or redownload.
#######################################
verify_gz_file() {
  local gz_path=$1
  local plain_path=${gz_path%.gz}
  local gz_size plain_size

  if [[ ! -f "$plain_path" ]]; then
    echo "File needs to be decompressed: $gz_path"
    return 0
  fi

  # Declared separately from the assignment so a stat failure is not masked.
  if ! gz_size=$(stat -c%s "$gz_path") \
    || ! plain_size=$(stat -c%s "$plain_path"); then
    echo "Error: cannot determine file sizes for: $gz_path" >&2
    return 1
  fi

  if (( plain_size <= gz_size )); then
    echo "Warning: Decompressed file is not larger than the original: $gz_path"
    rm -- "$plain_path" || return
    echo "Deleted potentially corrupted uncompressed file: $plain_path"
    redownload "$gz_path"
  else
    echo "Size verification successful for: $gz_path"
  fi
}

# Find and perform initial checks on each .gz file.
# The loop runs in the current shell (not in `find -exec sh -c`), because a
# child shell does not have the redownload function defined. File names are
# read NUL-delimited on fd 3 so that stdin stays free for the commands called.
while IFS= read -r -d '' -u 3 found_gz; do
  verify_gz_file "$found_gz"
done 3< <(find . -type f -name "*.gz" -print0)

echo "Decompressing and extracting files where necessary..."

#######################################
# Decompress one .gz file next to itself (keeping the archive) unless the
# decompressed file already exists; when the decompressed file is a .tar,
# extract it into a sibling directory named after the tarball.
# Arguments:
#   $1 - path to the .gz file
# Outputs:
#   Progress messages on stdout; failures on stderr.
# Returns:
#   0 on success, 1 when gunzip, mkdir or tar failed.
#######################################
unpack_gz_file() {
  local gz_path=$1
  local plain_path=${gz_path%.gz}
  local parent_dir tar_name extract_dir
  parent_dir=$(dirname -- "$plain_path")
  tar_name=$(basename -- "$plain_path")
  extract_dir="${parent_dir}/${tar_name%.tar}"

  # Decompress .gz files
  if [[ ! -f "$plain_path" ]]; then
    echo "Decompressing file: $gz_path"
    if ! gunzip -k -- "$gz_path"; then
      # Report the failure instead of announcing a completed decompression.
      echo "Error: decompression failed for: $gz_path" >&2
      return 1
    fi
    echo "Decompression completed for: $gz_path"
  fi

  # Create a new directory below the current one and extract tar files into it
  if [[ -f "$plain_path" && "${plain_path##*.}" == "tar" ]]; then
    echo "Creating directory for extraction: $extract_dir"
    if ! mkdir -p -- "$extract_dir"; then
      echo "Error: cannot create extraction directory: $extract_dir" >&2
      return 1
    fi
    echo "Extracting tar file: $tar_name into $extract_dir"
    if ! tar -xf "$plain_path" -C "$extract_dir"; then
      echo "Error: extraction failed for tar file: $tar_name" >&2
      return 1
    fi
    echo "Extraction completed for tar file: $tar_name into $extract_dir"
  fi
}

# Decompress and possibly extract tar files.
# A failure on one archive is counted and reported, then the loop continues
# with the remaining archives.
unpack_failures=0
while IFS= read -r -d '' -u 3 gz_candidate; do
  unpack_gz_file "$gz_candidate" || unpack_failures=$((unpack_failures + 1))
done 3< <(find . -type f -name "*.gz" -print0)

if (( unpack_failures > 0 )); then
  echo "Warning: $unpack_failures archive(s) could not be decompressed or extracted" >&2
fi

echo "Script execution completed."



4 changes: 4 additions & 0 deletions data/whole_header.metta.datalog
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@


% Disable the style checker's warnings about discontiguous clauses.
% NOTE(review): presumably needed because the facts loaded together with this
% header are not grouped by predicate - confirm.
:- style_check(-discontiguous).

Loading

0 comments on commit 5ad5524

Please sign in to comment.