-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added test script, works well (v0.1)
- Loading branch information
Alex Dickson
committed
May 19, 2015
1 parent
935e752
commit 7e75e23
Showing
7 changed files
with
714 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,319 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
|
||
use lib "/Users/alexrd/perl"; | ||
use mj qw ( %names %symb ); | ||
|
||
my $pdbfile = shift(@ARGV); | ||
my $contactfile = shift(@ARGV); | ||
my $seqfile = shift(@ARGV); | ||
|
||
if (!defined $contactfile) { | ||
$contactfile = "contact_freq.dat"; | ||
} | ||
|
||
if (!defined $seqfile) { | ||
$seqfile = "clustal_frag_align.dat"; | ||
} | ||
|
||
|
||
sub help { | ||
print(" | ||
colorpdb.pl | ||
Usage: ./colorpdb.pl pdb_file contact_table (default: contact_freq.dat) sequence_align_file (default: clustal_frag_align.dat) | ||
Alex Dickson | ||
University of Michigan | ||
This script uses a specified pdb as a template, and creates a set of pdbs with | ||
contact frequency information in the beta factor column. Given NAME.pdb, it will output: | ||
NAME_prot.pdb | ||
NAME_phob.pdb | ||
NAME_phil.pdb | ||
NAME_water.pdb | ||
which show the frequency of contacts with all protein amino acids, hydrophobic amino | ||
acids, hydrophilic amino acids, and water, respectively. | ||
This also prints out pdbs that are colored by sequence conservation: | ||
NAME_cons_exact.pdb -- printing the frequency of exact matches | ||
NAME_cons_type.pdb -- printing the frequency of matches by type (hydrophobic, aromatic, +ve, -ve, etc.) | ||
It uses the both the NAME.pdb file in the current directory, as well as the aligned pdb | ||
file in the seqalign/ directory to get the amino acid numbering after alignment. | ||
"); | ||
exit(0); | ||
} | ||
|
||
my $flag = shift(@ARGV); | ||
if ($flag eq "-h" || $flag eq "-help") { | ||
&help; | ||
} | ||
|
||
if (!-f $pdbfile) { | ||
print("Error: pdbfile does not exist!\n"); | ||
&help; | ||
} | ||
|
||
(my $pdb) = ($pdbfile =~ m/([0-9A-Za-z]+)\.pdb/); | ||
my $tmp = `ls seqalign/${pdb}*`; | ||
my @alignfiles = split(/\n/,$tmp); | ||
|
||
if (scalar @alignfiles == 0) { | ||
print("Error: no aligned files in the seqalign/ directory!\n"); | ||
&help; | ||
} | ||
|
||
my %alignseq; | ||
# read alignfile | ||
if (!-f $seqfile) { | ||
print("Error! seqfile $seqfile is not defined!\n"); | ||
&help; | ||
} | ||
open(MYIN,$seqfile); | ||
while (my $line = <MYIN>) { | ||
chomp($line); | ||
my @arr = split(/\s+/,$line); | ||
if (defined $arr[0] && $arr[0] ne "CLUSTAL") { | ||
$alignseq{$arr[0]} .= "$arr[1]"; | ||
} | ||
} | ||
close(MYIN); | ||
|
||
(my $exactref, my $typeref) = &getcons(\%alignseq); | ||
my @match_exact = @{$exactref}; | ||
my @match_type = @{$typeref}; | ||
|
||
my @labels; | ||
my %resinfo; | ||
my $firstres = undef; | ||
# read in the table | ||
open(MYIN,"$contactfile"); | ||
my $line = <MYIN>; | ||
chomp($line); | ||
my @labels = split(/ /,$line); | ||
while ($line = <MYIN>) { | ||
chomp($line); | ||
my @arr = split(/ /,$line); | ||
if (!defined $firstres) { | ||
$firstres = $arr[0]; | ||
} | ||
for my $i (1..@arr-1) { | ||
$resinfo{$arr[0]}{$labels[$i]} = $arr[$i]; | ||
} | ||
} | ||
|
||
my @tk = keys %alignseq; | ||
my $l = length($alignseq{$tk[0]}); | ||
my $lastres = $firstres + $l - 1; | ||
|
||
# get avg properties: prot, phob, phil, water | ||
my @phob = qw (val leu ile phe met); | ||
my @phil = qw (asp asn glu gln ser lys arg his thr); | ||
my @others = qw (gly ala cys tyr trp pro); | ||
for my $res (keys %resinfo) { | ||
$resinfo{$res}{"phob"} = 0; | ||
$resinfo{$res}{"prot"} = 0; | ||
$resinfo{$res}{"phil"} = 0; | ||
$resinfo{$res}{"water"} = 0; | ||
for my $lab (keys %{$resinfo{$res}}) { | ||
for (@phob) { | ||
if ($lab eq $_) { | ||
$resinfo{$res}{"phob"} += $resinfo{$res}{$_}; | ||
$resinfo{$res}{"prot"} += $resinfo{$res}{$_}; | ||
} | ||
} | ||
|
||
for (@phil) { | ||
if ($lab eq $_) { | ||
$resinfo{$res}{"phil"} += $resinfo{$res}{$_}; | ||
$resinfo{$res}{"prot"} += $resinfo{$res}{$_}; | ||
} | ||
} | ||
|
||
for (@others) { | ||
if ($lab eq $_) { | ||
$resinfo{$res}{"prot"} += $resinfo{$res}{$_}; | ||
} | ||
} | ||
|
||
if ($lab eq "hoh") { | ||
$resinfo{$res}{"water"} += $resinfo{$res}{$_}; | ||
} | ||
} | ||
} | ||
|
||
# read through pdb, printing out new ones as you go | ||
if (!-d "color") { | ||
mkdir("color"); | ||
} | ||
for my $i (0..@alignfiles-1) { | ||
(my $alignchain) = ($alignfiles[$i] =~ m/([A-Z])\.pdb/); | ||
open(MYIN,"$alignfiles[$i]"); | ||
my $out = $alignfiles[$i]; | ||
$out =~ s/seqalign/color/; | ||
my $phobout = $out; | ||
$phobout =~ s/align/phob/; | ||
my $philout = $out; | ||
$philout =~ s/align/phil/; | ||
my $protout = $out; | ||
$protout =~ s/align/prot/; | ||
my $waterout = $out; | ||
$waterout =~ s/align/water/; | ||
my $exactout = $out; | ||
$exactout =~ s/align/match_exact/; | ||
my $typeout = $out; | ||
$typeout =~ s/align/match_type/; | ||
open(PHOBOUT,">$phobout"); | ||
open(PHILOUT,">$philout"); | ||
open(PROTOUT,">$protout"); | ||
open(WATEROUT,">$waterout"); | ||
open(EXACTOUT,">$exactout"); | ||
open(TYPEOUT,">$typeout"); | ||
|
||
while (my $line = <MYIN>) { | ||
if (substr($line,0,4) eq "ATOM" || substr($line,0,6) eq "HETATM") { | ||
my $chain = substr($line,21,1); | ||
my $resnum = substr($line,22,4); | ||
$resnum =~ s/\s+//; | ||
|
||
if ($chain eq "$alignchain" && exists $resinfo{$resnum}) { | ||
my $tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$resinfo{$resnum}{"phob"})); | ||
print(PHOBOUT "$tline"); | ||
|
||
$tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$resinfo{$resnum}{"phil"})); | ||
print(PHILOUT "$tline"); | ||
|
||
$tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$resinfo{$resnum}{"prot"})); | ||
print(PROTOUT "$tline"); | ||
|
||
$tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$resinfo{$resnum}{"water"})); | ||
print(WATEROUT "$tline"); | ||
|
||
$tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$match_exact[$resnum-$firstres])); | ||
print(EXACTOUT "$tline"); | ||
|
||
$tline = $line; | ||
substr($tline,55,5,sprintf("%5.2f",$match_type[$resnum-$firstres])); | ||
print(TYPEOUT "$tline"); | ||
|
||
} else { | ||
substr($line,55,5,sprintf("%5.2f",0)); | ||
print(PHOBOUT "$line"); | ||
print(PHILOUT "$line"); | ||
print(PROTOUT "$line"); | ||
print(WATEROUT "$line"); | ||
print(EXACTOUT "$line"); | ||
print(TYPEOUT "$line"); | ||
} | ||
} | ||
} | ||
close(MYIN); | ||
close(PHOBOUT); | ||
close(PHILOUT); | ||
close(PROTOUT); | ||
close(WATEROUT); | ||
close(EXACTOUT); | ||
close(TYPEOUT); | ||
|
||
# visualize first file | ||
my $vmd = "/Developer/Applications/VMD\\ 1.9.1.app/Contents/MacOS/startup.command"; | ||
system("$vmd -e viscolors.tcl -args $phobout $philout $protout $waterout $exactout $typeout $firstres $lastres $alignchain"); | ||
} | ||
|
||
|
||
|
||
|
||
sub getcons { | ||
my %seq = %{@_[0]}; | ||
my @keys = keys %seq; | ||
|
||
my $n = length($seq{$keys[0]}); | ||
|
||
my @exact; | ||
my @type; | ||
|
||
for my $i (0..$n-1) { | ||
my %aas; | ||
my %aas_type; | ||
|
||
my $nused = 0; | ||
|
||
for my $key (@keys) { | ||
my $aa = substr($seq{$key},$i,1); | ||
if ($aa ne "-") { | ||
my $type = &gettype($aa); | ||
if (!exists $aas{$aa}) { | ||
$aas{$aa} = 0; | ||
} | ||
$aas{$aa}++; | ||
if (!exists $aas_type{$type}) { | ||
$aas_type{$type} = 0; | ||
} | ||
$aas_type{$type}++; | ||
$nused++; | ||
} | ||
} | ||
|
||
my $norm = $nused*($nused-1); | ||
|
||
if ($norm > 0) { | ||
my $sum = 0; | ||
for (keys %aas) { | ||
$sum += ($aas{$_}-1)*$aas{$_}; | ||
} | ||
$exact[$i] = $sum/$norm; | ||
|
||
$sum = 0; | ||
for (keys %aas_type) { | ||
$sum += ($aas_type{$_}-1)*$aas_type{$_}; | ||
} | ||
$type[$i] = $sum/$norm; | ||
} else { | ||
$exact[$i] = 1; | ||
$type[$i] = 1; | ||
} | ||
} | ||
|
||
return(\@exact,\@type); | ||
} | ||
|
||
sub gettype { | ||
|
||
# uses optimal amino acid grouping predicted by maximum variance (see Table I of Wrabl and Grishin, Proteins, (61) 523-534, 2005) | ||
|
||
my $aa = lc(shift); | ||
if ("wfy" =~ m/$aa/) { | ||
return 0; | ||
} elsif ("mliv" =~ m/$aa/) { | ||
return 1; | ||
} elsif ("c" =~ m/$aa/) { | ||
return 2; | ||
} elsif ("g" =~ m/$aa/) { | ||
return 3; | ||
} elsif ("p" =~ m/$aa/) { | ||
return 4; | ||
} elsif ("ats" =~ m/$aa/) { | ||
return 5; | ||
} elsif ("nde" =~ m/$aa/) { | ||
return 6; | ||
} elsif ("hqrk" =~ m/$aa/) { | ||
return 7; | ||
} else { | ||
die("Error! amino acid $aa has no type"); | ||
} | ||
|
||
|
||
|
||
} | ||
|
||
|
Oops, something went wrong.