forked from gustavo11/GFFLib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff_gene_cluster_based_on_genomic_location.pl
executable file
·144 lines (87 loc) · 2.96 KB
/
gff_gene_cluster_based_on_genomic_location.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/bin/env perl
use strict;
use FindBin;
use lib $FindBin::Bin;
use GFFFile;
use GFFUtils;
my $debug = 0;
# Define cluster of genes based on their location in the genome (gene locus).
# Genes that overlap are considered members of the same cluster
# The gene with the longest CDS is chosen as the representative of the cluster
# All representatives are reported in the GFF OUT file
my $usage = "$0 <GFF IN> <GFF OUT with one representative for each cluster > \n\n";
die $usage if scalar(@ARGV) != 2;
my $gff_filename_in = $ARGV[0];
my $out_gff = $ARGV[1];
my $gff_handler = GFFFile::new( $gff_filename_in );
$gff_handler->read();
my $gffGenes = $gff_handler->get_genes_hash();
# Ordering genes based on template name and start coord
my @gffGenesArray = values %{$gffGenes};
GFFUtils::sort_gene_arrays( \@gffGenesArray, 0 );
# As genes get added to the cluster the program adjust those variables
# They represent left and right end of the range spanning all genes in
# the clusters
my $cluster_start = -1;
my $cluster_end = -1;
my $last_gene_chrom;
my @cluster;
open OUT, ">$out_gff" or die "Unable to open file $out_gff to write\n";
for my $currGene (@gffGenesArray) {
my $gene_id = $currGene->get_id();
my $gffTranscripts = $currGene->get_transcripts_hash();
my $start = $currGene->get_start();
my $end = $currGene->get_end();
my $chrom = $currGene->get_chrom();
print STDERR "$gene_id...\n" if $debug;
if( $start > $end ){
my $temp = $start;
$start = $end;
$end = $temp;
}
# Start new clusters if no intersection
# or if chromosome have changed
if ( $start > $cluster_end || $chrom ne $last_gene_chrom ){
# If previous cluster not empty, choose the representative
# gene and dump it
if( scalar ( @cluster ) != 0 ){
my $chosen_gene = choose_representative( \@cluster );
print OUT $chosen_gene->toGFF();
undef @cluster;
$cluster_start = -1;
$cluster_end = -1;
}
$cluster_start = $start;
}
# Extend end of the cluster if needed
$cluster_end = $end if $end > $cluster_end;
push @cluster, $currGene;
$last_gene_chrom = $chrom;
}
my $chosen_gene = choose_representative( \@cluster );
print OUT $chosen_gene->toGFF();
close(OUT);
exit(0);
sub choose_representative{
my ($clusterRef) = @_;
my $chosen;
my $longest_cds = 0;
foreach my $currGene ( @{$clusterRef} ){
my $id = $currGene->get_id();
my $cds_length = 0;
my $gffTranscripts = $currGene->get_transcripts_hash();
for my $currTranscript ( values %{$gffTranscripts} ) {
my $temp_cds_length = $currTranscript->get_CDS_length();
$cds_length = $temp_cds_length if $temp_cds_length > $cds_length;
}
print STDERR "ID: $id CDS_LENGTH: $cds_length\n";
if( $cds_length > $longest_cds ){
$chosen = $currGene;
$longest_cds = $cds_length;
}
}
my $chosen_id = $chosen->get_id();
print STDERR "CHOSEN ID: $chosen_id\n";
print STDERR "======================\n";
return $chosen;
}