-
Notifications
You must be signed in to change notification settings - Fork 3
/
VAMP-seq_analysis.Rmd
1766 lines (1468 loc) · 154 KB
/
VAMP-seq_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "VAMP-seq analysis for PTEN and TPMT"
author: "Kenneth A. Matreyek"
date: "2/20/2018"
output: html_document
---
# Analysis setup
This R markdown file will go through the processes of analyzing the data for the VAMP-seq manuscript, using 4 files as the starting point. These files are the PTEN and TPMT variant score data tables, as well as the PTEN and TPMT positional score data tables. These files were provided as supplementary data files from the journal, and can also be obtained at our GitHub repo. Please place these files in the same directory as this R Markdown file to allow the analyses to proceed.
Furthermore, create a directory titled "Output" in the same directory as this R Markdown file to create the graphics files used to generate the figures.
```{r Get everything from the four supplementary tables}
knitr::opts_chunk$set(echo = TRUE)
rm(list = ls())
graphics.off()
markdown_directory <- getwd()
set.seed(12345)
pten_variants <- read.table(file = "PTEN_variant_data.tsv", stringsAsFactors = FALSE, sep = "\t", header = TRUE)
tpmt_variants <- read.table(file = "TPMT_variant_data.tsv", stringsAsFactors = FALSE, sep = "\t", header = TRUE)
pten_positions <- read.table(file = "PTEN_positional_data.tsv", stringsAsFactors = FALSE, sep = "\t", header = TRUE)
tpmt_positions <- read.table(file = "TPMT_positional_data.tsv", stringsAsFactors = FALSE, sep = "\t", header = TRUE)
```
Also, load the necessary packages, and set up the default theme that will be used in all plots
```{r setup, include=FALSE, warning = FALSE}
if(!require(ggplot2)){install.packages("ggplot2")}
library(ggplot2)
if(!require(plyr)){install.packages("plyr")}
library(plyr)
if(!require(reshape)){install.packages("reshape")}
library(reshape)
if(!require(data.table)){install.packages("data.table")}
library(data.table)
if(!require(scales)){install.packages("scales")}
library(scales)
if(!require(GGally)){install.packages("GGally")}
library(GGally)
if(!require(ggdendro)){install.packages("ggdendro")}
library(ggdendro)
if(!require(grid)){install.packages("grid")}
library(grid)
if(!require(MASS)){install.packages("MASS")}
library(MASS)
## The package versions used by the author
version #R version 3.4.4
paste("ggplot2 version:",packageVersion("ggplot2")) #‘2.2.1’
paste("plyr version:",packageVersion("plyr")) #‘1.8.4’
paste("reshape version:",packageVersion("reshape")) #‘0.8.7’
paste("data.table version:",packageVersion("data.table")) #‘1.10.4.3’
paste("scales version:",packageVersion("scales")) #‘0.5.0’
paste("GGally version:",packageVersion("GGally")) #‘1.3.2’
paste("ggdendro version:",packageVersion("ggdendro")) #‘0.1.20’
paste("grid version:",packageVersion("grid")) #‘3.4.4’
paste("MASS version:",packageVersion("MASS")) #‘7.3.49’
theme_new <- theme_set(theme_bw())
# Original theme (has gridlines)
theme_new <- theme_update(panel.grid.major = element_line(colour = "grey85"),panel.grid.minor = element_line(colour = "grey98"))
```
Here, we are also changing the factor levels of the amino acids to reflect a more informative order:
```{r Changing the default factor levels for amino acids}
pten_positions$start <- factor(pten_positions$start, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
pten_variants$start <- factor(pten_variants$start, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
pten_variants$end <- factor(pten_variants$end, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
tpmt_positions$start <- factor(tpmt_positions$start, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
tpmt_variants$start <- factor(tpmt_variants$start, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
tpmt_variants$end <- factor(tpmt_variants$end, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
```
Here are some basic stats on the abundance classes for the PTEN and TPMT datatables we just imported:
(note: variants with single amino acid substitutions were referred to as "missense" in the "class" column, though true missense variants achievable through single nucleotide variation are a subset of these with 1 noted in the "snv" column)
```{r PTEN basic stats, echo = FALSE}
paste("We quantified the abundance of", nrow(subset(pten_variants, class == "missense" & !is.na(score))) + nrow(subset(tpmt_variants, class == "missense" & !is.na(score))), "variants of two disease-related proteins, PTEN and TPMT")
paste("Furthermore, we identified", nrow(subset(pten_variants, class == "missense" & abundance_class == "low" & !is.na(score))), "low-abundance PTEN variants that could be pathogenic and", nrow(subset(tpmt_variants, class == "missense" & abundance_class == "low" & !is.na(score))), "low-abundance TPMT variants that could alter drug metabolism")
paste("There are ",nrow(subset(pten_variants, class == "synonymous" & !is.na(score))), " synonymous PTEN variants with abundance scores",sep="")
paste("There are ",nrow(subset(pten_variants, class == "nonsense" & !is.na(score))), " nonsense PTEN variants with abundance scores",sep="")
paste("There are ",nrow(subset(pten_variants, class == "missense" & !is.na(score))), " single amino acid PTEN variants with abundance scores",sep="")
paste("There are ",nrow(subset(pten_variants, class == "missense" & !is.na(score) & snv == 1)), " missense (possible by SNV) PTEN variants with abundance scores",sep="")
paste("There are ",nrow(subset(tpmt_variants, class == "synonymous" & !is.na(score))), " synonymous TPMT variants with abundance scores",sep="")
paste("There are ",nrow(subset(tpmt_variants, class == "nonsense" & !is.na(score))), " nonsense TPMT variants with abundance scores",sep="")
paste("There are ",nrow(subset(tpmt_variants, class == "missense" & !is.na(score))), " single amino acid TPMT variants with abundance scores",sep="")
paste("There are ",nrow(subset(tpmt_variants, class == "missense" & !is.na(score) & snv == 1)), " missense (possible by SNV) TPMT variants with abundance scores",sep="")
```
# Initial analyses: VAMP-seq assay performance
To validate that we observe different EGFP:mCherry ratios depending on the fused variant, we used EGFP with WT or uknown unstable variants, as well as numerous randomly picked variants expected to span the range of the ratio. The geometric mean of the distribution of ratios for each variant was used for the comparison.
Comparison of PTEN VAMP-seq scores with individually assessed EGFP:mCherry variants (WT-normalized geometric means plotted)
```{r pten_individual graph}
pten_variants_individual <- subset(pten_variants, !is.na(egfp_geomean))
pten_stability_control_variants <- c("WT","Y68H","C136R","D252G","L345Q","C124S")
pten_variants_individual$variant <- factor(pten_variants_individual$variant, levels = c(pten_stability_control_variants,pten_variants_individual$variant[!(pten_variants_individual$variant %in% pten_stability_control_variants)]))
PTEN_individual_plot <- ggplot() + geom_errorbar(data = pten_variants_individual, aes(x = variant, ymin = egfp_geomean_lower_ci, ymax = egfp_geomean_upper_ci), color = "black") +
geom_point(data = pten_variants_individual, aes(x = variant, y = egfp_geomean), color = "black", size = 2) +
geom_point(data = pten_variants_individual, aes(x = variant, y = egfp_geomean), color = "red", size = 1) +
scale_y_continuous(breaks = c(0,0.25,0.5,0.75,1,1.25,1.5)) + theme(axis.text.x = element_text(angle = -90, vjust = 0.5, hjust = 0)) +
xlab(NULL) + ylab("Mean WT-normalized\nEGFP / mCherry ratio")
ggsave(file = "Output/PTEN_individual_plot.pdf", PTEN_individual_plot, height = 3, width = 5)
PTEN_individual_plot
```
Comparison of TPMT VAMP-seq scores with individually assessed EGFP:mCherry variants (WT-normalized geometric means plotted)
```{r tpmt individual}
tpmt_individual <- subset(tpmt_variants, !is.na(single_WTNorm))[,c("variant","single_WTNorm","single_StErr")]
tpmt_individual <- rbind(tpmt_individual, c("WT","1","0"))
colnames(tpmt_individual) <- c("variant","mean","se")
tpmt_individual$mean <- as.numeric(tpmt_individual$mean)
tpmt_individual$se <- as.numeric(tpmt_individual$se)
tpmt_individual$lower_ci <- tpmt_individual$mean - qnorm(0.975) * tpmt_individual$se
tpmt_individual$upper_ci <- tpmt_individual$mean + qnorm(0.975) * tpmt_individual$se
tpmt_individual$position <- gsub("[^0-9]", "", tpmt_individual$variant)
tpmt_individual[tpmt_individual$position == "","position"] <- 0
tpmt_individual$position <- as.numeric(tpmt_individual$position)
tpmt_stability_control_variants <- c("WT","A80P","A154T","Y240C")
tpmt_individual$variant <- factor(tpmt_individual$variant, levels = c(tpmt_stability_control_variants,tpmt_individual$variant[!(tpmt_individual$variant %in% tpmt_stability_control_variants)]))
TPMT_individual_plot <- ggplot() + geom_errorbar(data = tpmt_individual, aes(x = variant, ymax = upper_ci, ymin = lower_ci), color = "black") +
geom_point(data = tpmt_individual, aes(x = variant, y = mean), color = "black", size = 2) +
geom_point(data = tpmt_individual, aes(x = variant, y = mean), color = "red", size = 1) +
scale_y_continuous(breaks = c(0,0.25,0.5,0.75,1,1.25,1.5)) + theme(axis.text.x = element_text(angle = -90, vjust = 0.5, hjust = 0)) +
xlab(NULL) + ylab("Mean WT-normalized\nEGFP / mCherry ratio")
ggsave(file = "Output/TPMT_individual_plot.pdf", TPMT_individual_plot, height = 3, width = 4.5)
TPMT_individual_plot
```
Paste some stats amongst the PTEN and TPMT WTs and controls
```{r Paste some Stats amongst the PTEN and TPMT WTs and controls, echo=FALSE}
paste("PTEN WT to low abundance controls fold differene:", round(1 / mean(subset(pten_variants_individual, variant %in% c("Y68H","C124R","L345Q","D252G"))$egfp_geomean),2))
paste("TPMT WT to low abundance controls fold differene:", round(1 / mean(subset(tpmt_individual, variant %in% c("A80P","A154T","Y240C"))$mean),4))
paste("TPMT WT to low abundance control A80P fold differene:", round(1 / mean(subset(tpmt_individual, variant %in% c("A80P"))$mean),4))
paste("TPMT WT to low abundance control Y240C fold differene:", round(1 / mean(subset(tpmt_individual, variant %in% c("Y240C"))$mean),4))
```
In the case of PTEN, we also tested fusion of a shorter 15-amino acid fragment of EGFP, rather than the entire protein itself, to see if size of the fused sequence with the PTEN protein altered these ratios at all.
```{r Full EGFP and SplitGFP comparisons, echo = FALSE}
paste("PTEN full-EGFP:mCherry and splitGFP:mCherry fusion ratio correlations, Pearson's r:",round(sqrt(summary(lm(pten_variants$sgfp_geomean ~ pten_variants$egfp_geomean))$r.squared),2))
paste("PTEN full-EGFP:mCherry and splitGFP:mCherry fusion ratio correlations, Spearman's R:", round(cor(pten_variants$sgfp_geomean, pten_variants$egfp_geomean, use="pairwise.complete.obs", method = "spearman"),2))
```
```{r Print and save split-GFP plot, warning = FALSE}
PTEN_egfp_sgfp_plot <- ggplot() + geom_hline(yintercept = 1, color = "grey50") + geom_vline(xintercept = 1, color = "grey50") +
geom_abline(slope = lm(pten_variants$sgfp_geomean ~ pten_variants$egfp_geomean)$coefficient[2], intercept = lm(pten_variants$sgfp_geomean ~ pten_variants$egfp_geomean)$coefficient[1], linetype = 2, color = "grey50") +
geom_point(data = pten_variants, aes(x = egfp_geomean, y = sgfp_geomean)) +
geom_text(data = pten_variants, aes(x = egfp_geomean, y = sgfp_geomean + 0.03), label = pten_variants$variant, color = "red") +
annotate("text", x = min(subset(pten_variants, !is.na(sgfp_geomean))$egfp_geomean, na.rm = TRUE), y = max(subset(pten_variants, !is.na(sgfp_geomean))$sgfp_geomean, na.rm = TRUE) + 0.03,
label = paste("r:",round(sqrt(summary(lm(pten_variants$sgfp_geomean ~ pten_variants$egfp_geomean))$r.squared),2)), hjust = 0.1) +
annotate("text", x = min(subset(pten_variants, !is.na(sgfp_geomean))$egfp_geomean, na.rm = TRUE), y = max(subset(pten_variants, !is.na(sgfp_geomean))$sgfp_geomean - 0.1, na.rm = TRUE) + 0.03,
label = paste("rho:",round(cor(pten_variants$sgfp_geomean, pten_variants$egfp_geomean, use="pairwise.complete.obs", method = "spearman"),2)), hjust = 0.1, parse = TRUE) +
scale_x_continuous(limits = c(0,1.1), breaks = c(0,0.25,0.5,0.75,1), expand = c(0,0)) + scale_y_continuous(limits = c(0,1.1), breaks = c(0,0.25,0.5,0.75,1), expand = c(0,0)) +
xlab("WT-normalized EGFP/mCherry ratio") + ylab("WT-normalized split-EGFP/mCherry ratio")
ggsave(file = "Output/PTEN_egfp_sgfp_plot.pdf", PTEN_egfp_sgfp_plot, height = 4, width = 4)
PTEN_egfp_sgfp_plot
```
We also wanted to ensure that the replicate experiments correlated with each other. Thus, we made all-by-all comparisons of the replicate experiments for PTEN and TPMT.
```{r Getting some PTEN score correlations on, warning = FALSE}
scatterplot_correlation_fxn <- function(data, mapping, ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.01) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
p
}
pten_variants_min <- pten_variants[,c("class","score1","score2","score3","score4","score5","score6","score7","score8")]
tpmt_variants_min <- tpmt_variants[,c("class","score1","score2","score3","score4","score5","score6","score7","score8")]
## PTEN experiment correlations
pten_pearson_cor_matrix <- as.matrix(cor(pten_variants_min[,c("score1","score2","score3","score4","score5","score6","score7","score8")], use="pairwise.complete.obs", method = "pearson"))
pten_spearman_cor_matrix <- as.matrix(cor(pten_variants_min[,c("score1","score2","score3","score4","score5","score6","score7","score8")], use="pairwise.complete.obs", method = "spearman"))
pten_count_matrix <- as.matrix(cor(pten_variants_min[,c("score1","score2","score3","score4","score5","score6","score7","score8")], use="pairwise.complete.obs", method = "pearson"))
pten_scatterplot_matrix_pearson <- ggpairs(pten_variants_min,columns = c("score1","score2","score3","score4","score5","score6","score7","score8"), lower = list(continuous = scatterplot_correlation_fxn), method = "pearson")
ggsave(file = "Output/PTEN_scatterplot_matrix_pearson.png", plot = pten_scatterplot_matrix_pearson, device = "png", height =7, width = 7)
pten_pearson_cor_matrix
pten_spearman_cor_matrix
pten_scatterplot_matrix_pearson
pairwise_names <- c("score1","score2","score3","score4","score5","score6","score7","score8")
pten_pairwise_count_vector <- c()
for(x in 1:(length(pairwise_names)-1)){
for(y in (x+1):length(pairwise_names)){
print(paste(x,y))
pairwise_count <- nrow(subset(pten_variants_min, !is.na(pten_variants_min[,pairwise_names[x]]) & !is.na(pten_variants_min[,pairwise_names[y]])))
pten_pairwise_count_vector <- c(pten_pairwise_count_vector,pairwise_count)
}
}
print(paste("The PTEN replicate pairwise correlations were based on n values of:",toString(pten_pairwise_count_vector)))
## TPMT experiment correlations
tpmt_pearson_cor_matrix <- as.matrix(cor(tpmt_variants_min[,c("score1","score2","score3","score4","score5","score6","score7","score8")], use="pairwise.complete.obs", method = "pearson"))
tpmt_spearman_cor_matrix <- as.matrix(cor(tpmt_variants_min[,c("score1","score2","score3","score4","score5","score6","score7","score8")], use="pairwise.complete.obs", method = "spearman"))
# Remove data from TPMT library preparations with almost nonexistant overlapping variants
tpmt_pearson_cor_matrix[1:4,5:6] <- NA; tpmt_pearson_cor_matrix[5:6,1:4] <- NA
tpmt_spearman_cor_matrix[1:4,5:6] <- NA; tpmt_spearman_cor_matrix[5:6,1:4] <- NA
tpmt_scatterplot_matrix_pearson <- ggpairs(tpmt_variants_min,columns = c("score1","score2","score3","score4","score5","score6","score7","score8"), lower = list(continuous = scatterplot_correlation_fxn), method = "pearson")
ggsave(file = "Output/TPMT_scatterplot_matrix_pearson.png", tpmt_scatterplot_matrix_pearson, height = 7, width = 7)
tpmt_pearson_cor_matrix
tpmt_spearman_cor_matrix
tpmt_scatterplot_matrix_pearson
tpmt_pairwise_count_vector <- c()
for(x in 1:(length(pairwise_names)-1)){
for(y in (x+1):length(pairwise_names)){
pairwise_count <- nrow(subset(tpmt_variants_min, !is.na(tpmt_variants_min[,pairwise_names[x]]) & !is.na(tpmt_variants_min[,pairwise_names[y]])))
tpmt_pairwise_count_vector <- c(tpmt_pairwise_count_vector,pairwise_count)
print(paste(x,y,pairwise_count))
}
}
print(paste("The TPMT replicate pairwise correlations were based on n values of:",toString(tpmt_pairwise_count_vector)))
```
```{r Print out some stats about the experimental replicate correlations, echo = FALSE}
paste("Mean PTEN replicate correlation Pearson's r:",round(mean(pten_pearson_cor_matrix[-seq(1,nrow(pten_pearson_cor_matrix)^2,nrow(pten_pearson_cor_matrix)+1)]),2))
paste("Median PTEN replicate correlation Pearson's r:",round(median(pten_pearson_cor_matrix[-seq(1,nrow(pten_pearson_cor_matrix)^2,nrow(pten_pearson_cor_matrix)+1)]),2))
paste("Mean PTEN replicate correlation Spearman's r:",round(mean(pten_spearman_cor_matrix[-seq(1,nrow(pten_spearman_cor_matrix)^2,nrow(pten_spearman_cor_matrix)+1)]),2))
paste("Median PTEN replicate correlation Spearman's r:",round(median(pten_spearman_cor_matrix[-seq(1,nrow(pten_spearman_cor_matrix)^2,nrow(pten_spearman_cor_matrix)+1)]),2))
paste("Mean TPMT replicate correlation Pearson's r:",round(mean(tpmt_pearson_cor_matrix[-seq(1,nrow(tpmt_pearson_cor_matrix)^2,nrow(tpmt_pearson_cor_matrix)+1)], na.rm = TRUE),2))
paste("Median TPMT replicate correlation Pearson's r:",round(median(tpmt_pearson_cor_matrix[-seq(1,nrow(tpmt_pearson_cor_matrix)^2,nrow(tpmt_pearson_cor_matrix)+1)], na.rm = TRUE),2))
paste("Mean TPMT replicate correlation Spearman's r:",round(mean(tpmt_spearman_cor_matrix[-seq(1,nrow(tpmt_spearman_cor_matrix)^2,nrow(tpmt_spearman_cor_matrix)+1)], na.rm = TRUE),2))
paste("Median TPMT replicate correlation Spearman's r:",round(median(tpmt_spearman_cor_matrix[-seq(1,nrow(tpmt_spearman_cor_matrix)^2,nrow(tpmt_spearman_cor_matrix)+1)], na.rm = TRUE),2))
```
# Analyzing the VAMP-seq abundance scores
With the initial quality of the data assessed, we tried to summarize how the results looked at the protein level.
A critical thresholds we repeatedly used for PTEN classification was the 5% lowest synonymous variant score. This is more stringent than using the lower-bound of the 95% confidence interval of the synonymous distribution, assuming a normal distribution.
The lowest 10% of missense variants and the median of the synonymous variants were critical values used throughout the analysis as well.
```{r import data}
pten_synonymous_5th <- quantile(subset(pten_variants, class == "synonymous")$score, 0.05)
paste("The threshold value above which the top 95% of PTEN synonymous variant scores reside, which is used in the analysis:", round(pten_synonymous_5th,2))
paste("The lower bound of the 95% confidence interval of the PTEN synonymous distribution score assuming a normal distribution:", round(mean(subset(pten_variants, class == "synonymous")$score) - qnorm(0.975) * sd(subset(pten_variants, class == "synonymous")$score)/sqrt(8-1),2))
tpmt_synonymous_5th <- quantile(subset(tpmt_variants, class == "synonymous")$score, 0.05)
paste("The threshold value above which the top 95% of TPMT synonymous variant scores reside, which is used in the analysis:", round(tpmt_synonymous_5th,2))
```
#### Plots of nonsense variant scores by position
The abundance scores of nonsense variants were plotting along the positions of the PTEN and TPMT proteins.
```{r Nonsense mutations by position, fig.height = 1.5, fig.width = 2.75, warning = FALSE}
pten_trunc_score_position_plot <- ggplot() + geom_point(data = subset(pten_variants, class == "nonsense"), aes(x = as.numeric(position), y = score), size = 0.5) +
ylab("Stability score") + xlab("Position in protein") + geom_hline(yintercept = pten_variants[pten_variants$class == "wt","score"], color = "blue") +
scale_y_continuous(limits = c(-0.5,1.3), breaks = c(-0.4,0,0.4,0.8,1.2)) + scale_x_continuous(breaks = c(0,100,200,300,400))
ggsave(file = "Output/PTEN_trunc_score_position_plot.pdf", pten_trunc_score_position_plot, height = 1.5, width = 2.25)
tpmt_trunc_score_position_plot <- ggplot() + geom_point(data = subset(tpmt_variants, class == "nonsense"), aes(x = as.numeric(position), y = score), size = 0.5) +
ylab("Stability score") + xlab("Position in protein") + geom_hline(yintercept = pten_variants[pten_variants$class == "wt","score"], color = "blue") +
scale_y_continuous(limits = c(-0.5,1.3), breaks = c(-0.4,0,0.4,0.8,1.2)) + scale_x_continuous(breaks = c(0,50,100,150,200))
ggsave(file = "Output/TPMT_trunc_score_position_plot.pdf", tpmt_trunc_score_position_plot, height = 1.5, width = 2.25)
print(pten_trunc_score_position_plot)
print(tpmt_trunc_score_position_plot)
```
### Heatmap and density plot legend for PTEN
To summarize the data, heatmaps for the abundance scores of both proteins were produced.
First, here is the heatmap for PTEN. The WT side-chains at each position are marked by the black square, and colored according to the WT score. Positions that did not have any data are colored grey. Please see the gradients used to color the single amino acid variants in the density plot created in the code chunk below this one for the color key.
```{r Big PTEN summary heatmap, echo=FALSE, fig.height = 9, fig.width = 3}
wtseq <- "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV"
wtseqframe <- data.frame(seq(1,nchar(wtseq)))
wtseqframe$end <- rep("NA",nchar(wtseq))
for (x in 1:nchar(wtseq)){wtseqframe$end[x] <- substr(wtseq,x,x)}
wtseqframe$score <- pten_variants[pten_variants$class == "wt","score"]
colnames(wtseqframe) <- c("position","end","score")
pten_lower_bound <- as.numeric(quantile(subset(pten_variants, class == "missense")$score, 0.1, na.rm = TRUE))
position_vector <- seq(1:nchar(wtseq))
aa_vector <- c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X")
full_combinations <- as.vector(outer(position_vector, aa_vector, paste, sep=""))
missing_combinations <- data.frame(full_combinations[!(full_combinations %in% substr(subset(pten_variants, !is.na(score))$variant,2,10))]); colnames(missing_combinations) <- "variant"
missing_combinations$position <- gsub("[^0-9]","",missing_combinations$variant); missing_combinations$position <- as.numeric(missing_combinations$position)
missing_combinations$end <- gsub("[0-9]","",missing_combinations$variant); missing_combinations$end <- factor(missing_combinations$end, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
pten_full_variant_heatmap <- ggplot() +
geom_tile(data = missing_combinations, aes(x = end, y = position), fill = "grey70") +
geom_tile(data = subset(pten_variants, (class == "missense" | class == "nonsense") & score > pten_lower_bound & score < 1), aes(x = end, y = position, fill = score)) +
geom_tile(data = subset(pten_variants, (class == "missense" | class == "nonsense") & score >= 1), aes(x = end, y = position, fill = score)) +
geom_tile(data = subset(pten_variants, (class == "missense" | class == "nonsense") & score <= pten_lower_bound), aes(x = end, y = position), fill = "blue") +
geom_tile(data = wtseqframe, aes(x = end, y= position, fill = score)) +
geom_point(data = wtseqframe, aes(x = end, y = position), shape = 15, size = 0.4) +
xlab(NULL) + scale_y_reverse(expand = c(0,0), breaks = c(1,seq(10,400,10))) + ylab(NULL) +
scale_fill_gradientn(colours = c("blue","white","red"), values = rescale(c(pten_lower_bound,1,2)), limits=c(pten_lower_bound,2)) +
theme(panel.background = element_rect(fill = "white"), legend.position = "none",
panel.grid.major = element_line(colour = "grey90"))
ggsave(file = "Output/PTEN_full_variant_heatmap.pdf", pten_full_variant_heatmap, height = 6, width = 3)
print(pten_full_variant_heatmap)
```
These are the separate score distributions for synonymous, nonsense, and single amino acid variants. Synonymous and nonsense variants are colored dark red and dark blue, respectively. The single amino acid variants were colored according to their score, with the single amino acids with the 10% lowest scores colored dark blue, going in a gradient toward the WT value of 1 which was colored white, with variants with scores higher than 1 going in a color gradient going toward red.
```{r PTEN density plot, fig.height = 1.5, fig.width = 3, warning = FALSE}
pten_m <- ggplot(data = subset(pten_variants, class == "missense"), aes(x = score, y = ..scaled..))
pten_m <- pten_m + geom_density()
pten_q <- ggplot_build(pten_m)$data[[1]]
pten_q_low <- subset(pten_q, x <= pten_lower_bound)
pten_q_mid <- subset(pten_q, x > pten_lower_bound & x < 1)
pten_q_high <- subset(pten_q, x >= 1)
pten_heatmap_density <- ggplot() +
geom_segment(data = pten_q_low , aes(x = x, y = y, xend = x, yend = 0), colour = "blue", size = 2) +
geom_segment(data = pten_q_mid, aes(x = x, y = y, xend = x, yend = 0, colour = x), size = 2) +
geom_segment(data = pten_q_high, aes(x = x, y = y, xend = x, yend = 0, colour = x), size = 2) +
geom_density(data = subset(pten_variants, class == "missense"), aes(x = score, y = ..scaled..), size = 1) +
scale_color_gradientn(colours = c("blue","white","red"), values = rescale(c(pten_lower_bound,1,2)), limits=c(pten_lower_bound,2)) +
scale_x_continuous(limits = c(-0.8, 2), breaks = seq(-0.4,2,0.4), expand = c(0,0)) +
scale_y_continuous(limits = c(0,1.2), breaks = c(0,0.4,0.8,1.2), expand = c(0,0.002)) + xlab("Variant abundance score") + ylab("Density") +
theme(panel.background = element_rect(fill = "white"), legend.position = "none") +
geom_density(data = subset(pten_variants, class == "synonymous"), aes(x = score, y = ..scaled..), color = "darkred", alpha = 0.4, size = 1, linetype = 2) +
geom_density(data = subset(pten_variants, class == "nonsense"), aes(x = score, y = ..scaled..), color = "darkblue", alpha = 0.4, size = 1, linetype = 2) +
geom_vline(xintercept = quantile(subset(pten_variants, class == "synonymous")$score,0.05), linetype = 2)
ggsave(file = "Output/PTEN_heatmap_density.pdf", pten_heatmap_density, height = 1.2, width = 3)
print(pten_heatmap_density)
```
#### Heatmap and density plot legend for TPMT
Analagous heatmap and density plots for TPMT. See above PTEN chunks for details.
```{r Big TPMT summary heatmap, echo=FALSE, fig.height = 9, fig.width = 3}
wtseq <- "MDGTRTSLDIEEYSDTEVQKNQVLTLEEWQDKWVNGKTAFHQEQGHQLLKKHLDTFLKGKSGLRVFFPLCGKAVEMKWFADRGHSVVGVEISELGIQEFFTEQNLSYSEEPITEIPGTKVFKSSSGNISLYCCSIFDLPRTNIGKFDMIWDRGALVAINPGDRKCYADTMFSLLGKKFQYLLCVLSYDPTKHPGPPFYVPHAEIERLFGKICNIRCLEKVDAFEERHKSWGIDCLFEKLYLLTEK"
wtseqframe <- data.frame(seq(1,nchar(wtseq)))
wtseqframe$end <- rep("NA",nchar(wtseq))
for (x in 1:nchar(wtseq)){wtseqframe$end[x] <- substr(wtseq,x,x)}
wtseqframe$score <- tpmt_variants[tpmt_variants$class == "wt","score"]
colnames(wtseqframe) <- c("position","end","score")
tpmt_lower_bound <- as.numeric(quantile(subset(tpmt_variants, class == "missense")$score, 0.1, na.rm = TRUE))
position_vector <- seq(1:nchar(wtseq))
aa_vector <- c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X")
full_combinations <- as.vector(outer(position_vector, aa_vector, paste, sep=""))
missing_combinations <- data.frame(full_combinations[!(full_combinations %in% substr(subset(tpmt_variants, !is.na(score))$variant,2,10))]); colnames(missing_combinations) <- "variant"
missing_combinations$position <- gsub("[^0-9]","",missing_combinations$variant); missing_combinations$position <- as.numeric(missing_combinations$position)
missing_combinations$end <- gsub("[0-9]","",missing_combinations$variant); missing_combinations$end <- factor(missing_combinations$end, levels = c("A","V","I","L","M","F","Y","W","S","T","N","Q","C","G","P","R","H","K","D","E","X"))
tpmt_full_variant_heatmap <- ggplot() +
geom_tile(data = missing_combinations, aes(x = end, y = position), fill = "grey70") +
geom_tile(data = subset(tpmt_variants, (class == "missense" | class == "nonsense") & score > tpmt_lower_bound & score < 1), aes(x = end, y = position, fill = score)) +
geom_tile(data = subset(tpmt_variants, (class == "missense" | class == "nonsense") & score >= 1), aes(x = end, y = position, fill = score)) +
geom_tile(data = subset(tpmt_variants, (class == "missense" | class == "nonsense") & score <= tpmt_lower_bound), aes(x = end, y = position), fill = "blue") +
geom_tile(data = wtseqframe, aes(x = end, y= position, fill = score)) +
geom_point(data = wtseqframe, aes(x = end, y = position), shape = 15, size = 0.4) +
xlab(NULL) + scale_y_reverse(expand = c(0,0), breaks = c(1,seq(10,400,10))) + ylab(NULL) +
scale_fill_gradientn(colours = c("blue","white","red"), values = rescale(c(tpmt_lower_bound,1,2)), limits=c(tpmt_lower_bound,2)) +
theme(panel.background = element_rect(fill = "white"), legend.position = "none",
panel.grid.major = element_line(colour = "grey90"))
ggsave(file = "Output/TPMT_full_variant_heatmap.pdf", tpmt_full_variant_heatmap, height = 3.6, width = 3)
print(tpmt_full_variant_heatmap)
```
Here is the TPMT density plot, with the single-amino acid variant color gradient serving as the heatmap color legend.
```{r TPMT density plot, fig.height = 1.5, fig.width = 3, warning = FALSE}
tpmt_m <- ggplot(data = subset(tpmt_variants, class == "missense"), aes(x = score, y = ..scaled..))
tpmt_m <- tpmt_m + geom_density()
tpmt_q <- ggplot_build(tpmt_m)$data[[1]]
tpmt_q_low <- subset(tpmt_q, x <= tpmt_lower_bound)
tpmt_q_mid <- subset(tpmt_q, x > tpmt_lower_bound & x < 1)
tpmt_q_high <- subset(tpmt_q, x >= 1)
tpmt_heatmap_density <- ggplot() +
geom_segment(data = tpmt_q_low, aes(x = x, y = y, xend = x, yend = 0), colour = "blue", size = 2) +
geom_segment(data = tpmt_q_mid, aes(x = x, y = y, xend = x, yend = 0, colour = x), size = 2) +
geom_segment(data = tpmt_q_high, aes(x = x, y = y, xend = x, yend = 0, colour = x), size = 2) +
geom_density(data = subset(tpmt_variants, class == "missense"), aes(x = score, y = ..scaled..), size = 1) +
scale_color_gradientn(colours = c("blue","white","red"), values = rescale(c(tpmt_lower_bound,1,2)), limits=c(tpmt_lower_bound,2)) +
scale_x_continuous(limits = c(-0.8, 2), breaks = seq(-0.4,2,0.4), expand = c(0,0)) +
scale_y_continuous(limits = c(0,1.2), breaks = c(0,0.4,0.8,1.2), expand = c(0,0.002)) + xlab("Variant abundance score") + ylab("Density") +
theme(panel.background = element_rect(fill = "white"), legend.position = "none") +
geom_density(data = subset(tpmt_variants, class == "synonymous"), aes(x = score, y = ..scaled..), color = "darkred", alpha = 0.4, size = 1, linetype = 2) +
geom_density(data = subset(tpmt_variants, class == "nonsense"), aes(x = score, y = ..scaled..), color = "darkblue", alpha = 0.4, size = 1, linetype = 2) +
geom_vline(xintercept = quantile(subset(tpmt_variants, class == "synonymous")$score,0.05), linetype = 2)
ggsave(file = "Output/TPMT_heatmap_density.pdf", tpmt_heatmap_density, height = 1.2, width = 3)
tpmt_heatmap_density
```
The single amino acid variant distributions were compared between PTEN and TPMT, to see which protein had more single amino acid variants with scores outside of the range of the synonymous distribution.
```{r PTEN and TPMT missense density comparisons}
pten_weighted_average_density_plot <- ggplot() + geom_vline(xintercept = subset(pten_variants, class == "wt")$median_w_ave, linetype = 3) + geom_density(data = subset(pten_variants, class == "synonymous"), aes(x = median_w_ave, y = ..scaled..), color = "red") + geom_density(data = subset(pten_variants, class == "nonsense" & position > 50 & position < 350), aes(x = median_w_ave, y = ..scaled..), color = "blue") + geom_density(data = subset(pten_variants, class == "missense"), aes(x = median_w_ave, y = ..scaled..), color = "black") + scale_x_continuous(limits = c(0.25, 1), expand = c(0,0)) + ylab("Density") + xlab("Median weighted average value")
ggsave(file = "Output/PTEN_weighted_ave_smoothed_histograms.pdf", pten_weighted_average_density_plot, height = 1, width = 2.5)
pten_weighted_average_density_plot
tpmt_weighted_average_density_plot <- ggplot() + geom_vline(xintercept = subset(tpmt_variants, class == "wt")$median_w_ave, linetype = 3) + geom_density(data = subset(tpmt_variants, class == "synonymous"), aes(x = median_w_ave, y = ..scaled..), color = "red") + geom_density(data = subset(tpmt_variants, class == "nonsense" & position > 50 & position < 200), aes(x = median_w_ave, y = ..scaled..), color = "blue") + geom_density(data = subset(tpmt_variants, class == "missense"), aes(x = median_w_ave, y = ..scaled..), color = "black") + scale_x_continuous(limits = c(0.25, 1), expand = c(0,0)) + ylab("Density") + xlab("Median weighted average value")
ggsave(file = "Output/TPMT_weighted_ave_smoothed_histograms.pdf", tpmt_weighted_average_density_plot, height = 1, width = 2.5)
tpmt_weighted_average_density_plot
combined_density_plot <- ggplot() + geom_density(data = subset(pten_variants, class == "missense" & !is.na(score)), aes(x = score, y = ..density../2), fill = "grey25", color = "black", alpha = 0.4) +
geom_density(data = subset(tpmt_variants, class == "missense" & !is.na(score)), aes(x = score, y = ..density../2), fill = "dark green", color = "black", alpha = 0.4) +
xlab("Abundance score") + ylab("Density") + geom_vline(xintercept = quantile(subset(pten_variants, class == "synonymous")$score,0.05), linetype = 2) + geom_vline(xintercept = quantile(subset(tpmt_variants, class == "synonymous")$score,0.05), linetype = 2, color = "dark green")
ggsave(file = "Output/Overlapping_missense_smoothed_histograms.pdf", combined_density_plot, height = 1.5, width = 2.5)
combined_density_plot
```
```{r Stats about what percentage of variants are below synonymous distributions, echo = FALSE}
paste(round(nrow(subset(pten_variants, !is.na(score) & class == "missense" & score < quantile(subset(pten_variants, class == "synonymous")$score,0.05))) / nrow(subset(pten_variants, !is.na(score) & class == "missense")) * 100,1),
" percent of PTEN missense variants have abundance scores below the synonymous distribution", sep = "")
paste(round(nrow(subset(tpmt_variants, !is.na(score) & class == "missense" & score < quantile(subset(tpmt_variants, class == "synonymous")$score,0.05))) / nrow(subset(tpmt_variants, !is.na(score) & class == "missense")) * 100,1),
" percent of TPMT missense variants have abundance scores below the synonymous distribution", sep = "")
```
To complement the VAMP-seq data summary provided by the heatmap and density plots, I also made complementary plots showing the median score at each position (thus demonstrating tolerability of each position to substitution), and the PSIC score of evolutionary conservation of each position.
```{r Getting a moving average of the positional scores, fig.width = 1, fig.height = 8.35, warning = FALSE}
ma <- function(x,n=3){filter(x,rep((1/n),n), sides=2)}
pten_positions$score_moving_average <- ma(pten_positions$score)
PTEN_positional_line_graph <- ggplot() +
geom_point(data = subset(pten_positions, variants_scored >= 5), aes(x = position, y = score), size = 0.5, color = "black") +
geom_line(data = pten_positions, aes(x = position, y = score_moving_average), color = "black", size = 0.8, alpha = 0.5) +coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) + scale_y_continuous(breaks = c(0.24,0.5,0.75,1)) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/PTEN_positional_line_graph.pdf", PTEN_positional_line_graph, height = 6.1, width = 1)
PTEN_positional_line_graph
tpmt_positions$score_moving_average <- ma(tpmt_positions$score)
TPMT_positional_line_graph <- ggplot() +
geom_point(data = subset(tpmt_positions, variants_scored >= 5), aes(x = position, y = score), size = 0.5, color = "black") +
geom_line(data = tpmt_positions, aes(x = position, y = score_moving_average), color = "black", size = 0.8, alpha = 0.5) +coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/TPMT_positional_line_graph.pdf", TPMT_positional_line_graph, height = 3.6, width = 1)
TPMT_positional_line_graph
pten_positions$psic_moving_average <- ma(pten_positions$AA1_psic)
PTEN_psic_line_graph <- ggplot() +
geom_point(data = pten_positions, aes(x = position, y = AA1_psic), size = 0.5, color = "black") +
geom_line(data = pten_positions, aes(x = position, y = psic_moving_average), color = "black", size = 0.8, alpha = 0.5) +coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/PTEN_psic_line_graph.pdf", PTEN_psic_line_graph, height = 6.1, width = 1)
PTEN_psic_line_graph
tpmt_positions$psic_moving_average <- ma(tpmt_positions$AA1_psic)
TPMT_psic_line_graph <- ggplot() +
geom_point(data = tpmt_positions, aes(x = position, y = AA1_psic), size = 0.5, color = "black") +
geom_line(data = tpmt_positions, aes(x = position, y = psic_moving_average), color = "black", size = 0.8, alpha = 0.5) +coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/TPMT_psic_line_graph.pdf", TPMT_psic_line_graph, height = 3.6, width = 1)
TPMT_psic_line_graph
PTEN_positional_bar_graph <- ggplot() +
geom_bar(data = pten_positions, aes(x = position, y = variants_scored), color = NA, fill = "grey70", stat = "identity") +
coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) + scale_y_continuous(breaks = c(0,5,10,15,19), limits = c(0,19), expand = c(0,0)) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/PTEN_positional_bar_graph.pdf", PTEN_positional_bar_graph, height = 6.1, width = 0.8)
PTEN_positional_bar_graph
TPMT_positional_bar_graph <- ggplot() +
geom_bar(data = tpmt_positions, aes(x = position, y = variants_scored), color = NA, fill = "grey70", stat = "identity") +
coord_flip() +
scale_x_reverse(expand = c(0,0.05), breaks = c(1, seq(10,400,10))) + xlab(NULL) + ylab(NULL) + scale_y_continuous(breaks = c(0,5,10,15,19), limits = c(0,19), expand = c(0,0)) +
theme(axis.text.x = element_text(angle = -90, vjust = 0.5))
ggsave(file = "Output/TPMT_positional_bar_graph.pdf", TPMT_positional_bar_graph, height = 3.6, width = 0.8)
TPMT_positional_bar_graph
```
I also made simple line graphs that show the domain architectures for PTEN and TPMT.
```{r Making a graph of the domain architecture, fig.width = 0.6, fig.height = 8.35}
PTEN_architecture_schematic <- ggplot() +
geom_rect(aes(xmin = 1, xmax = 15, ymin = 0, ymax = 1), color = NA, fill = "grey50", alpha = 0.5) +
geom_rect(aes(xmin = 15, xmax = 185, ymin = 0, ymax = 1), color = NA, fill = "#cee076", alpha = 0.5) +
geom_rect(aes(xmin = 185, xmax = 351, ymin = 0, ymax = 1), color = NA, fill = "#d28de8", alpha = 0.5) +
geom_rect(aes(xmin = 351, xmax = 400, ymin = 0, ymax = 1), color = NA, fill = "grey30", alpha = 0.5) +
geom_segment(aes(x = 1, y = 0, xend = 403), yend = 0, size = 2) +
scale_x_reverse(breaks = c(1,185,350), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) + xlab(NULL) + ylab(NULL) + coord_flip()
ggsave(file = "Output/PTEN_architecture.pdf", PTEN_architecture_schematic, height = 6, width = 0.6)
PTEN_architecture_schematic
TPMT_architecture_schematic <- ggplot() +
geom_rect(aes(xmin = 1, xmax = 245, ymin = 0, ymax = 1), color = NA, fill = "grey50", alpha = 0.5) +
geom_segment(aes(x = 1, y = 0, xend = 245), yend = 0, size = 2) +
scale_x_reverse(breaks = c(1,245), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) + xlab(NULL) + ylab(NULL) + coord_flip()
ggsave(file = "Output/TPMT_architecture.pdf", TPMT_architecture_schematic, height = 3.6, width = 0.6)
TPMT_architecture_schematic
```
And lastly, a line graph that shows the basic secondary structural elements observed in the pdb of PTEN and TPMT.
Blue is alpha helices, and pink is beta-strands. Black are positions resolved in the crystal structure, while grey is missing positions.
```{r Secondary structure schematic of PTEN and TPMT, fig.height = 8, fig.width = 0.6}
PTEN_dssp_schematic <- ggplot() +
geom_segment(aes(x = 1, y = 0, xend = max(pten_positions$position)), yend = 0, size = 1, color = "grey70") +
geom_point(data = subset(pten_positions, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
geom_point(data = subset(pten_positions, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
geom_point(data = subset(pten_positions, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
scale_x_reverse(breaks = c(1,185,350), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) + xlab(NULL) + ylab(NULL) + coord_flip() +
theme(panel.border = element_blank(), axis.text.y = element_blank())
ggsave(file = "Output/PTEN_dssp_schematic.pdf", PTEN_dssp_schematic, height = 6, width = 0.4)
PTEN_dssp_schematic
tpmt_dssp_schematic <- ggplot() +
geom_segment(aes(x = 1, y = 0, xend = max(tpmt_positions$position)), yend = 0, size = 1, color = "grey70") +
geom_point(data = tpmt_positions, aes(x = position, y = 0), color = "black", size = 1.8) +
geom_point(data = subset(tpmt_positions, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
geom_point(data = subset(tpmt_positions, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
scale_x_reverse(breaks = c(1,100,200,245), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) + xlab(NULL) + ylab(NULL) + coord_flip() +
theme(panel.border = element_blank(), axis.text.y = element_blank())
ggsave(file = "Output/TPMT_dssp_schematic.pdf", tpmt_dssp_schematic, height = 3.6, width = 0.3)
tpmt_dssp_schematic
```
# Validating the PTEN and TPMT VAMP-seq scores
To validate the accuracy of the PTEN VAMP-seq data, I took two dozen variants that spanned the range of scores observed in VAMP-seq, engineered the mutations in a plasmid containing EGFP-tagged PTEN, recombined the cells, and assessed the EGFP/mCherry ratios in the recombined cells by flow cytometry.
```{r Abundance scores compared with individual EGFP/mCherry ratio assessments, fig.height = 8, fig.width = 8, warning = FALSE}
pten_control_lm <- lm(formula = score ~ egfp_geomean_log10, data = pten_variants)
PTEN_control_scatterplot <- ggplot() + geom_abline(slope = pten_control_lm$coefficients[[2]], intercept = pten_control_lm$coefficients[[1]], linetype = 2, color = "grey50") +
geom_point(data = pten_variants, aes(x = egfp_geomean_log10, y = score)) +
annotate("text", x = pten_variants$egfp_geomean_log10, y = pten_variants$score + 0.05, label = pten_variants$variant, color = "red", size = 2) +
xlab("Log10 normalized score (wt = 0)\nof EGFP/mCherry geomean") + ylab("Abundance score") +
annotate("text", x = -1.8, y = pten_variants[pten_variants$variant == "C124S","score"] + 0.1, label = paste("n: ", nrow(subset(pten_variants, !is.na(egfp_geomean_log10) & !is.na(score))), sep = ""), parse = TRUE) +
annotate("text", x = -1.8, y = pten_variants[pten_variants$variant == "C124S","score"], label = paste("r: ", round(cor(pten_variants$score, pten_variants$egfp_geomean_log10, use="pairwise.complete.obs", method = "pearson"),2), sep = ""), parse = TRUE) +
annotate("text", x = -1.8, y = pten_variants[pten_variants$variant == "C124S","score"] - 0.1, label = paste("rho:", round(cor(pten_variants$score, pten_variants$egfp_geomean_log10, use="pairwise.complete.obs", method = "spearman"),2), sep = ""), parse = TRUE) +
scale_y_continuous(expand = c(0,0))
PTEN_control_scatterplot
ggsave(file = "Output/PTEN_validation_individual.pdf", plot = PTEN_control_scatterplot, height =3, width = 3)
```
```{r Print stats, echo = FALSE}
paste("PTEN individual validation correlation, n:", nrow(subset(pten_variants, !is.na(score) & !is.na(egfp_geomean_log10))))
paste("PTEN individual validation correlation, Pearson's R:", round(cor(pten_variants$score, pten_variants$egfp_geomean_log10, use="pairwise.complete.obs", method = "pearson"),2))
paste("PTEN individual validation correlation, Spearman's rho:", round(cor(pten_variants$score, pten_variants$egfp_geomean_log10, use="pairwise.complete.obs", method = "spearman"),2))
```
A similar analysis testing individual variants of TPMT.
```{r High-throughput scores compared to individual geometric means, fig.height = 4, fig.width = 4}
tpmt_variants[tpmt_variants$variant == "WT","single_WTNorm"] <- 1
tpmt_subset <- subset(tpmt_variants, !is.na(single_WTNorm) & !is.na(score))
tpmt_subset$egfp_geomean_log10 <- log10(tpmt_subset$single_WTNorm)
tpmt_control_lm <- lm(formula = score ~ egfp_geomean_log10, data = tpmt_subset)
paste("TPMT individual validation correlation, n:", nrow(subset(tpmt_subset, !is.na(score) & !is.na(egfp_geomean_log10))))
paste("TPMT individual validation correlation, Pearson's R:", round(cor(tpmt_subset$score, tpmt_subset$egfp_geomean_log10, use="pairwise.complete.obs", method = "pearson"),2))
paste("TPMT individual validation correlation, Spearman's rho:", round(cor(tpmt_subset$score, tpmt_subset$egfp_geomean_log10, use="pairwise.complete.obs", method = "spearman"),2))
TPMT_control_scatterplot <- ggplot() + geom_abline(slope = tpmt_control_lm$coefficients[[2]], intercept = tpmt_control_lm$coefficients[[1]], linetype = 2, color = "grey50") +
geom_point(data = tpmt_subset, aes(x = egfp_geomean_log10, y = score)) +
annotate("text", x = tpmt_subset$egfp_geomean_log10, y = tpmt_subset$score + 0.03, label = tpmt_subset$variant, color = "red", size = 2) +
xlab("Log10 normalized score (wt = 0)\nof EGFP/mCherry geomean") + ylab("Abundance score") +
annotate("text", x = min(tpmt_subset$egfp_geomean_log10) + 0.12, y = max(tpmt_subset$score), label = paste("n:", nrow(tpmt_subset), sep = ""), parse = TRUE) +
annotate("text", x = min(tpmt_subset$egfp_geomean_log10) + 0.12, y = max(tpmt_subset$score) - 0.1, label = paste("r:", round(sqrt(summary(tpmt_control_lm)$r.squared),2), sep = ""), parse = TRUE) +
annotate("text", x = min(tpmt_subset$egfp_geomean_log10) + 0.12, y = max(tpmt_subset$score) - 0.2, label = paste("rho:", round(cor(tpmt_subset$score, tpmt_subset$egfp_geomean_log10, use="pairwise.complete.obs", method = "spearman"),2), sep = ""), parse = TRUE) +
scale_y_continuous(expand = c(0,0.1)) + scale_x_continuous(expand = c(0,0.15))
TPMT_control_scatterplot
ggsave(file = "Output/TPMT_validation_individual.pdf", plot = TPMT_control_scatterplot, height =3, width = 3)
```
As another test, PTEN VAMP-seq scores were compared with abundance phenotypes observed in western blots from published papers. See supplementary table 10 for a list of variant abundance phenotypes, along with the corresponding PMID of the report.
```{r Comparing PTEN abundance scores with western blots in the literature, fig.height = 4, fig.width = 3, warning = FALSE}
pten_lit_destabilized_subset <- subset(pten_variants, !is.na(pten_variants$lit_destabilized))
paste("There are",nrow(pten_lit_destabilized_subset),"PTEN variants being compared with abundance phenotypes in the literature")
for(x in 1:nrow(pten_lit_destabilized_subset)){
if(pten_lit_destabilized_subset$lit_destabilized[x] == "no"){pten_lit_destabilized_subset$lit_destabilized[x] <- "WT-like in reports"}
if(pten_lit_destabilized_subset$lit_destabilized[x] == "conflict"){pten_lit_destabilized_subset$lit_destabilized[x] <- "Conflicting in reports"}
if(pten_lit_destabilized_subset$lit_destabilized[x] == "yes"){pten_lit_destabilized_subset$lit_destabilized[x] <- "Less present in reports"}
}
pten_lit_destabilized_subset$lit_destabilized <- factor(pten_lit_destabilized_subset$lit_destabilized, levels = c("WT-like in reports","Conflicting in reports","Less present in reports"))
pten_lit_destabilized_plot <- ggplot() +
geom_jitter(data = pten_lit_destabilized_subset, aes(x = pten_lit_destabilized_subset$lit_destabilized, y = score, color = pten_lit_destabilized_subset$lit_destabilized),size = 3, alpha = 0.5, width = 0.4) +
xlab("Western blot phenotypes in literature") + ylab("Stability score from expts") +
theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + geom_hline(yintercept = pten_synonymous_5th, linetype = 2)
pten_lit_destabilized_plot
ggsave(file = "Output/PTEN_literature_stability_plot.pdf", pten_lit_destabilized_plot, height = 3.5, width = 2)
## Run a ratio permutation test to understand significance
lit_destabilized_actual_ratio <- mean(subset(pten_lit_destabilized_subset, lit_destabilized == "Less present in reports")$score, na.rm = TRUE) /
mean(subset(pten_lit_destabilized_subset, lit_destabilized == "WT-like in reports")$score, na.rm = TRUE)
iterations <- 100
lit_destabilized_starting_vector <- as.character(pten_lit_destabilized_subset$lit_destabilized)
lit_stabilized_working_matrix <- data.frame(matrix(NA,ncol = 1, nrow = length(lit_destabilized_starting_vector)))
lit_destabilized_ratio_matrix <- data.frame(matrix(NA,ncol = iterations, nrow = 1))
for(y in 1:iterations){
for(x in 1:length(lit_destabilized_starting_vector)){
lit_stabilized_working_matrix[x,1] <- sample(lit_destabilized_starting_vector, 1, replace = FALSE)
}
lit_stabilized_working_matrix <- cbind(pten_lit_destabilized_subset[,c("variant","score")], lit_stabilized_working_matrix)
colnames(lit_stabilized_working_matrix) <- c("variant","score","class")
lit_destabilized_ratio_matrix[1,y] <- mean(subset(lit_stabilized_working_matrix, class == "Less present in reports")$score, na.rm = TRUE) /
mean(subset(lit_stabilized_working_matrix, class == "WT-like in reports")$score, na.rm = TRUE)
}
paste(sum(lit_destabilized_ratio_matrix < lit_destabilized_actual_ratio),"of",iterations,"permutations were smaller")
paste("The p value by the permutation test is less than ", round((sum(lit_destabilized_ratio_matrix < lit_destabilized_actual_ratio) + 1) / (iterations + 1), 5), sep = "")
```
A similar analysis with TPMT. Please see supplementary table 10 to find a list of variants with WT-normalized published abundance phenotype, with the corresponding PMID of the report.
```{r Comparing High-throughput scores with classifications based on western blots in literature, fig.height = 4, fig.width = 3}
tpmt_lit_destabilized_subset <- subset(tpmt_variants, !is.na(tpmt_variants$published_western) & (tpmt_variants$published_western != "NaN") & !is.na(score))
paste("There are",nrow(tpmt_lit_destabilized_subset),"TPMT variants being compared with abundance phenotypes in the literature")
tpmt_lit_destabilized_plot <- ggplot() + #geom_boxplot(data = tpmt_lit_destabilized_subset, aes(x = tpmt_lit_destabilized_subset$published_western, y = score)) +
geom_point(data = tpmt_lit_destabilized_subset, aes(x = tpmt_lit_destabilized_subset$published_western, y = score), size = 3, alpha = 0.5) +
xlab("Described stability in the literature\n(western blots)") + ylab("Stability score from expts") +
theme(legend.position = "none") + geom_hline(yintercept = 0.5, linetype = 2)
tpmt_lit_destabilized_plot
ggsave(file = "Output/TPMT_literature_stability_plot.pdf", tpmt_lit_destabilized_plot, height = 2.5, width = 2.5)
```
# Biochemical analyses
This chunk of code compares how PTEN VAMP-seq abundance scores correlate with melting temperatures tested with purified PTEN protein in vitro (PMID: 25647146).
```{r Comparing PTEN biophysical stability (Melting point of purified protein), fig.width = 4, fig.height = 4, warning = FALSE}
pten_melting_temp_subset <- subset(pten_variants, !is.na(pten_variants$tm))
pten_melting_temp_lm <- lm(formula = tm ~ score, data = pten_melting_temp_subset)
melting_plot <- ggplot(data = pten_melting_temp_subset, aes(x = tm, y = score)) + geom_smooth(method = "lm", se = FALSE, color = "grey75", linetype = 2) +
geom_point() + annotate("text", x = pten_melting_temp_subset$tm, y = pten_melting_temp_subset$score + 0.05, label = pten_melting_temp_subset$variant, color = "red") +
ylab("Abundance score\n") + xlab("\nMelting temperature (Johnston et al, 2015)") + scale_x_continuous(expand= c(0.1,0.1)) +
annotate("text", x = min(pten_melting_temp_subset$tm), y = pten_variants[pten_variants$variant == "C124S","score"], label = paste("r:", round(sqrt(summary(pten_melting_temp_lm)$r.squared),2)), hjust = 0, parse = TRUE) +
annotate("text", x = min(pten_melting_temp_subset$tm), y = pten_variants[pten_variants$variant == "C124S","score"] - 0.1, label = paste("rho:", round(cor(pten_melting_temp_subset$score, pten_melting_temp_subset$tm, use="pairwise.complete.obs", method = "spearman"),2)), hjust = 0, parse = TRUE)
ggsave(file = "Output/PTEN_melting_temp_plot.pdf", melting_plot, height = 3, width = 3)
melting_plot
```
```{r Print PTEN melting temperature correlations}
paste("PTEN melting temperature and abundance score correlation, Pearson's R:", round(cor(pten_melting_temp_subset$score, pten_melting_temp_subset$tm, use="pairwise.complete.obs", method = "pearson"),2))
paste("PTEN melting temperature and abundance score correlation, Spearman's rho:", round(cor(pten_melting_temp_subset$score, pten_melting_temp_subset$tm, use="pairwise.complete.obs", method = "spearman"),2))
```
We looked for some basic patterns of similarity or difference in the tolerability of PTEN and TPMT to single amino acid subsitutions. Mutation of WT hydrophobic aromatic, methionine, or long nonpolar aliphatic amino acids produced the largest decreases in abundance for both proteins.
```{r Plots for stability effect by residue, fig.height = 3, fig.width = 2, warning = FALSE}
pten_residue_median <- ddply(subset(pten_variants, class == "missense" & !(is.na(score)))[,c("start","score")], "start", numcolwise(median))
tpmt_residue_median <- ddply(subset(tpmt_variants, class == "missense" & !(is.na(score)))[,c("start","score")], "start", numcolwise(median))
start_residue_median <- merge(pten_residue_median, tpmt_residue_median, by = "start")
colnames(start_residue_median) <- c("start","pten_residue_median","tpmt_residue_median")
starting_residue_median_scatterplot <- ggplot() + geom_point(data = start_residue_median, aes(x = pten_residue_median, y = tpmt_residue_median), alpha = 0.4) +
annotate("text", x = start_residue_median$pten_residue_median, y = start_residue_median$tpmt_residue_median + 0.015, label = start_residue_median$start) +
xlab("PTEN residue") + ylab("TPMT residue") + scale_y_continuous(limits = c(0.5, 1), breaks = seq(0.5, 1, 0.1)) + scale_x_continuous(limits = c(0.2, 1), breaks = seq(0.2, 1, 0.1)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
ggsave(file = "Output/Starting_residue_median_scatterplot.pdf", starting_residue_median_scatterplot, height = 2.5, width = 1.75)
starting_residue_median_scatterplot
pten_residue_median <- ddply(subset(pten_variants, class == "missense" & !(is.na(score)))[,c("end","score")], "end", numcolwise(median))
tpmt_residue_median <- ddply(subset(tpmt_variants, class == "missense" & !(is.na(score)))[,c("end","score")], "end", numcolwise(median))
end_residue_median <- merge(pten_residue_median, tpmt_residue_median, by = "end")
colnames(end_residue_median) <- c("end","pten_residue_median","tpmt_residue_median")
ending_residue_median_scatterplot <- ggplot() + geom_point(data = end_residue_median, aes(x = pten_residue_median, y = tpmt_residue_median), alpha = 0.4) +
annotate("text", x = end_residue_median$pten_residue_median, y = end_residue_median$tpmt_residue_median + 0.015, label = end_residue_median$end) +
xlab("PTEN residue") + ylab("TPMT residue") + scale_y_continuous(limits = c(0.58, 0.95), breaks = seq(0.6, 1, 0.1)) + scale_x_continuous(limits = c(0.5, 0.85), breaks = seq(0.5, 1, 0.1)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
ggsave(file = "Output/Ending_residue_median_scatterplot.pdf", ending_residue_median_scatterplot, height = 2.5, width = 1.75)
ending_residue_median_scatterplot
```
We then determined which features correlated with abundance score for the two different proteins using Spearman's correlation coefficients. In fact, WT amino acid hydrophobicity was negatively correlated with abundance score (Fig. 3b, WT hydroΦ), whereas mutant amino acid hydrophobicity was positively correlated with abundance score (MT hydroΦ). Conversely, mutations of WT amino acids with high relative solvent accessibility (RSA), polarity (WT Polarity), and crystal-structure temperature factor (B-factor), all features associated with polar residues present on the protein surface, were associated with high abundance scores.
```{r Looking at biophysical annotation correlations with score, warning = FALSE}
pten_variants_min <- subset(pten_variants, !is.na(score) & !is.na(rsa) & !is.na(evolutionary_coupling_avg) & class == "missense")
specific_features <- c("variant","class","score","rsa","abs_tco","kappa","alpha","phi","psi","bfactor","grantham","hydro1","hydro2","hydrodiff","vol1","vol2","voldiff","polarity1","polarity2","polaritydiff","weight1","weight2","weightdiff","crowding","AA1_PI","AA2_PI","deltaPI","AA1_psic","AA2_psic","delta_psic","evolutionary_coupling_avg")
pten_input_output <- pten_variants_min[,append(c("variant","score"),specific_features)]
pten_spearman_frame <- data.frame(matrix(nrow = length(specific_features) - 5, ncol = 2, "NA"))
colnames(pten_spearman_frame) <- c("feature","spearman_r")
pten_spearman_frame$feature <- as.character(pten_spearman_frame$feature)
pten_spearman_frame$spearman_r <- as.numeric(pten_spearman_frame$spearman_r)
y = 1
for(x in 4:length(specific_features)){
pten_spearman_frame[y,"feature"] <- specific_features[x]
pten_spearman_frame[y,"spearman_r"] <- cor(pten_input_output[,specific_features[x]], pten_input_output$score, method = "spearman")
y = y + 1
}
pten_spearman_frame$abs_spearman_r <- abs(as.numeric(pten_spearman_frame$spearman_r))
pten_spearman_frame <- pten_spearman_frame[order(-pten_spearman_frame$abs_spearman_r),]
tpmt_variants_min <- subset(tpmt_variants, !is.na(score) & !is.na(rsa) & !is.na(evolutionary_coupling_avg) & class == "missense")
specific_features <- c("variant","class","score","rsa","abs_tco","kappa","alpha","phi","psi","bfactor","grantham","hydro1","hydro2","hydrodiff","vol1","vol2","voldiff","polarity1","polarity2","polaritydiff","weight1","weight2","weightdiff","crowding","AA1_PI","AA2_PI","deltaPI","AA1_psic","AA2_psic","delta_psic","evolutionary_coupling_avg")
tpmt_input_output <- tpmt_variants_min[,append(c("variant","score"),specific_features)]
tpmt_spearman_frame <- data.frame(matrix(nrow = length(specific_features) - 5, ncol = 2, "NA"))
colnames(tpmt_spearman_frame) <- c("feature","spearman_r")
tpmt_spearman_frame$feature <- as.character(tpmt_spearman_frame$feature)
tpmt_spearman_frame$spearman_r <- as.numeric(tpmt_spearman_frame$spearman_r)
y = 1
for(x in 4:length(specific_features)){
tpmt_spearman_frame[y,"feature"] <- specific_features[x]
tpmt_spearman_frame[y,"spearman_r"] <- cor(tpmt_input_output[,specific_features[x]], tpmt_input_output$score, method = "spearman")
y = y + 1
}
tpmt_spearman_frame$abs_spearman_r <- abs(as.numeric(tpmt_spearman_frame$spearman_r))
tpmt_spearman_frame <- tpmt_spearman_frame[order(-tpmt_spearman_frame$abs_spearman_r),]
spearman_frame_merged <- merge(pten_spearman_frame[c("feature","spearman_r")], tpmt_spearman_frame[c("feature","spearman_r")], by = "feature")
colnames(spearman_frame_merged) <- c("feature","pten_spearman_r","tpmt_spearman_r")
spearman_frame_merged$mean = (spearman_frame_merged$pten_spearman_r + spearman_frame_merged$tpmt_spearman_r)/2
spearman_frame_merged$abs_spearman_r <- abs(as.numeric(spearman_frame_merged$mean))
spearman_frame_merged <- spearman_frame_merged[order(-spearman_frame_merged$abs_spearman_r),]
evolutionary_sequence <- c("AA1_psic","AA2_psic","delta_psic","evolutionary_coupling_avg")
primary_sequence <- c("hydro1","hydro2","hydrodiff","vol1","vol2","grantham","AA1_PI","AA2_PI","polarity1","polarity2","polaritydiff")
structural_sequence <- c("rsa","bfactor","crowding","abs_tco","substr_dist","alpha","phi","psi")
plotlabel_mergeframe <- subset(spearman_frame_merged, feature %in% c("rsa","bfactor","hydro2","evolutionary_coupling_avg","substr_dist","delta_psic","abs_tco","grantham","polarity1","polaritydiff","AA1_psic","AA2_psic","hydro1","hydrodiff","crowding"))
Both_spearman_features <- ggplot() + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
geom_point(data = subset(spearman_frame_merged, feature %in% evolutionary_sequence), aes(x = pten_spearman_r, y = tpmt_spearman_r), color = "red") +
geom_point(data = subset(spearman_frame_merged, feature %in% primary_sequence), aes(x = pten_spearman_r, y = tpmt_spearman_r), color = "cyan") +
geom_point(data = subset(spearman_frame_merged, feature %in% structural_sequence), aes(x = pten_spearman_r, y = tpmt_spearman_r), color = "blue") +
geom_text(data = plotlabel_mergeframe, aes(x = pten_spearman_r, y = tpmt_spearman_r + 0.01), label = plotlabel_mergeframe$feature, size = 2.5)
ggsave(file = "Output/Both_spearman_features.pdf", Both_spearman_features, height = 3, width = 3)
Both_spearman_features
```
Looking at how the PTEN and TPMT positional median scores correlate with secondary structure and PSIC measurements of evolutionary conservation.
```{r Getting Spearmans R for positional medians and evolutionary conservation, echo = FALSE, warning = FALSE}
pten_psic_scatterplot <- ggplot() + geom_point(data = pten_positions, aes(x = score, AA1_psic), alpha = 0.4) + xlab("PTEN abundance score") + ylab("PTEN PSIC")
ggsave("Output/PTEN_psic_scatterplot.pdf", pten_psic_scatterplot, height = 2, width = 3.5)
pten_psic_scatterplot
tpmt_psic_scatterplot <- ggplot() + geom_point(data = tpmt_positions, aes(x = score, AA1_psic), alpha = 0.4) + xlab("TPMT abundance score") + ylab("TPMT PSIC")
ggsave("Output/TPMT_psic_scatterplot.pdf", tpmt_psic_scatterplot, height = 2, width = 3.5)
tpmt_psic_scatterplot
paste("Spearman's rho for PTEN AA1_psic and score:",round(cor(pten_positions$AA1_psic, pten_positions$score, use="pairwise.complete.obs", method = "spearman"),2))
paste("Spearman's rho for TPMT AA1_psic and score:",round(cor(tpmt_positions$AA1_psic, tpmt_positions$score, use="pairwise.complete.obs", method = "spearman"),2))
```
```{r Structure score plot, warning = FALSE}
pten_secondary_structure_score_plot <- ggplot() + geom_jitter(data = subset(pten_positions, helix != 1 & sheet != 1 & !is.na(xca)), aes(x = "3. No secondary structure", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "3. No secondary structure", y = median(subset(pten_positions, helix != 1 & sheet != 1 & !is.na(xca))$score, na.rm = TRUE), ymin = median(subset(pten_positions, helix != 1 & sheet != 1 & !is.na(xca))$score, na.rm = TRUE), ymax = median(subset(pten_positions, helix != 1 & sheet != 1 & !is.na(xca))$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(pten_positions, helix == 1), aes(x = "1. Alpha helix", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "1. Alpha helix", y = median(subset(pten_positions, helix == 1)$score, na.rm = TRUE), ymin = median(subset(pten_positions, helix == 1)$score, na.rm = TRUE), ymax = median(subset(pten_positions, helix == 1)$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(pten_positions, sheet == 1), aes(x = "2. Beta sheet", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "2. Beta sheet", y = median(subset(pten_positions, sheet == 1)$score, na.rm = TRUE), ymin = median(subset(pten_positions, sheet == 1)$score, na.rm = TRUE), ymax = median(subset(pten_positions, sheet == 1)$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(pten_positions, is.na(xca)), aes(x = "4. Unresolved in structure", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "4. Unresolved in structure", y = median(subset(pten_positions, is.na(xca))$score, na.rm = TRUE), ymin = median(subset(pten_positions, is.na(xca))$score, na.rm = TRUE), ymax = median(subset(pten_positions, is.na(xca))$score, na.rm = TRUE)), color = "red") +
xlab(NULL) + ylab("PTEN abundance score") + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) + scale_y_continuous(breaks = seq(0,1.2,0.2)) +
annotate("text", x = "1. Alpha helix", y = 1.4, label = paste("n:",nrow(subset(pten_positions, helix == 1)))) +
annotate("text", x = "2. Beta sheet", y = 1.4, label = paste("n:",nrow(subset(pten_positions, sheet == 1)))) +
annotate("text", x = "3. No secondary structure", y = 1.4, label = paste("n:",nrow(subset(pten_positions, helix != 1 & sheet != 1 & !is.na(xca))))) +
annotate("text", x = "4. Unresolved in structure", y = 1.4, label = paste("n:",nrow(subset(pten_positions, is.na(xca)))))
ggsave("Output/PTEN_secondary_structure_score_plot.pdf", pten_secondary_structure_score_plot, height = 3.5, width = 4)
pten_secondary_structure_score_plot
tpmt_secondary_structure_score_plot <- ggplot() + geom_jitter(data = subset(tpmt_positions, helix != 1 & sheet != 1), aes(x = "3. No secondary structure", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "3. No secondary structure", y = median(subset(tpmt_positions, helix != 1 & sheet != 1)$score, na.rm = TRUE), ymin = median(subset(tpmt_positions, helix != 1 & sheet != 1)$score, na.rm = TRUE), ymax = median(subset(tpmt_positions, helix != 1 & sheet != 1)$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(tpmt_positions, helix == 1), aes(x = "1. Alpha helix", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "1. Alpha helix", y = median(subset(tpmt_positions, helix == 1)$score, na.rm = TRUE), ymin = median(subset(tpmt_positions, helix == 1)$score, na.rm = TRUE), ymax = median(subset(tpmt_positions, helix == 1)$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(tpmt_positions, sheet == 1), aes(x = "2. Beta sheet", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "2. Beta sheet", y = median(subset(tpmt_positions, sheet == 1)$score, na.rm = TRUE), ymin = median(subset(tpmt_positions, sheet == 1)$score, na.rm = TRUE), ymax = median(subset(tpmt_positions, sheet == 1)$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = subset(tpmt_positions, is.na(xca)), aes(x = "4. Unresolved in structure", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "4. Unresolved in structure", y = median(subset(tpmt_positions, is.na(xca))$score, na.rm = TRUE), ymin = median(subset(tpmt_positions, is.na(xca))$score, na.rm = TRUE), ymax = median(subset(tpmt_positions, is.na(xca))$score, na.rm = TRUE)), color = "red") +
xlab(NULL) + ylab("TPMT abundance score") + scale_y_continuous(breaks = seq(0,1.2,0.2)) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) + scale_y_continuous(breaks = seq(0,1.2,0.2)) +
annotate("text", x = "1. Alpha helix", y = 1.4, label = paste("n:",nrow(subset(tpmt_positions, helix == 1)))) +
annotate("text", x = "2. Beta sheet", y = 1.4, label = paste("n:",nrow(subset(tpmt_positions, sheet == 1)))) +
annotate("text", x = "3. No secondary structure", y = 1.4, label = paste("n:",nrow(subset(tpmt_positions, helix != 1 & sheet != 1 & !is.na(xca))))) +
annotate("text", x = "4. Unresolved in structure", y = 1.4, label = paste("n:",nrow(subset(tpmt_positions, is.na(xca)))))
ggsave("Output/TPMT_secondary_structure_score_plot.pdf", tpmt_secondary_structure_score_plot, height = 3.5, width = 4)
tpmt_secondary_structure_score_plot
```
#### The following chunks of code create .pml files that can be used to visualize the abundance data directly in Pymol (just drag the pml files into Pymol)
This chunk of code imports the pdb for PTEN (1d5r), removes the water molecules, and sets the basic visual motifs. Lastly, it creates a standard vantage point used in all of the subsequent .pml files for PTEN.
```{r PTEN pymol pml export setup, echo=FALSE, fig.height = 1, fig.width = 2}
# Create a vector of commands common to all pymol scripting I'll be doing
pymol_setup <- c()
# Import the PDB and do initial setup steps
pymol_setup <- append(pymol_setup, c("reinitialize","fetch 1d5r, async = 0","bg_color white","set opaque_background, 0","hide all","set ray_trace_mode, 1"))
# Create individual objects for PTEN and the substrate
pymol_setup <- append(pymol_setup, c("create pten, not resn TLA and not resn HOH","create substrate, resn TLA","delete 1d5r","color grey80, pten","show spheres, substrate","color magenta, substrate","show sticks, substrate","show cartoon, pten","orient pten"))
# Set the view to a common useful orientation
pymol_pten_standard_view<-c("orient pten","set_view (-0.143308833,0.549923658,-0.822817922,-0.982117832,-0.181517780,0.049738724,-0.122004509,0.815239608,0.566107035,0.001983175,-0.000477783,-151.571762085,38.322704315,79.638130188,31.967512131,76.972015381,216.736602783,-20.000000000)")
```
A custom coloring dataframe was used to get the colorscales used in this analysis script and display them in the pymol representations of PTEN.
```{r PTEN colorscale for Pymol, echo = FALSE, warning = FALSE}
## [[2]] Colorscales from low to mid and [[3]] Colorscales from mid to high
pymol_colorframe <- rbind(ggplot_build(pten_heatmap_density)$data[[2]][,c("x","colour")], ggplot_build(pten_heatmap_density)$data[[3]][,c("x","colour")])
colnames(pymol_colorframe) <- c("pymol_scores","pymol_colors")
pymol_colorframe$pymol_scores <- as.numeric(pymol_colorframe$pymol_scores)
pten_positions$color <- NA
for(x in 1:nrow(subset(pten_positions, !is.na(score)))){
if(!is.na(pten_positions$score[x])){
pten_positions$color[x] <- pymol_colorframe[which.min(abs(pymol_colorframe$pymol_scores - pten_positions$score[x])),"pymol_colors"]
}}
pymol_colorframe$r <- NA
pymol_colorframe$g <- NA
pymol_colorframe$b <- NA
for(x in 1:nrow(pymol_colorframe)){
pymol_colorframe$r[x] <- col2rgb(pymol_colorframe$pymol_colors[x])[1]
pymol_colorframe$g[x] <- col2rgb(pymol_colorframe$pymol_colors[x])[2]
pymol_colorframe$b[x] <- col2rgb(pymol_colorframe$pymol_colors[x])[3]
}
pymol_color_setup_vector <- c()
for(x in 1:nrow(pymol_colorframe)){
pymol_color_setup_vector <- append(pymol_color_setup_vector, paste("set_color ",pymol_colorframe$pymol_colors[x],", [",pymol_colorframe$r[x],",",pymol_colorframe$g[x],",",pymol_colorframe$b[x],"]",sep=""))
}
pymol_color_score_vector <- c()
for(x in 1:nrow(pten_positions)){
pymol_color_score_vector <- append(pymol_color_score_vector, paste("color ",pten_positions$color[x],", resi ", pten_positions$position[x], " and pten",sep=""))
}
```
To visualize the 20% of positions least tolerant to substitution, these positions were dispalyed as a semi-transparent surface representation.
```{r PTEN pymol most destabilized positions}
# Make a new vector for the 20% lowest positional scores
pymol_lowest_scores <- c()
# Make a command that creates a new object of only the 20 % most destabilization-prone positions
pymol_lowest_scores <- append(pymol_lowest_scores, paste("create lowest_20, pten and resi ",gsub(", ","+",toString(subset(pten_positions, score < quantile(pten_positions$score, 0.2, na.rm = TRUE))$position)),"", sep= ""))
# Make the destabilized positions a semi-transparent surface representation
pymol_lowest_scores <- append(pymol_lowest_scores, c("show surface, lowest_20","set transparency, 0.5, lowest_20","orient pten"))
fileConn<-file("Output/Pymol_1A_PTEN_lowest_scores.pml")
writeLines(c(pymol_setup,pymol_color_setup_vector,pymol_color_score_vector,pymol_lowest_scores,pymol_pten_standard_view,
paste("png ",markdown_directory,"/Output/Pymol_1A_PTEN_lowest_scores.png, width=10cm, dpi=300, ray=1",sep="")
), fileConn)
close(fileConn)
```
#### Now let's see if I can combine both the low-score hbonds and salt bridges to make groupings in 3-dimensional space
First, I am selecting for PTEN positions that were expected to form hydrogen bonds or salt bridges. Here are some stats on how many of these positions existed in the PTEN crystal structure, as well as how many had (positional) low abundance scores, and shows these score distributions of these two classes of positions on a plot.
```{r PTEN looking at the number of potential polar contacts, echo = FALSE}
print(paste("There are", nrow(subset(pten_positions, schbond_unique_partners >= 1 | saltbridge_unique_partners >= 1)), "positions potentially involved in h-bond or salt bridge interactions"))
print(paste("There are", nrow(subset(pten_positions, (schbond_unique_partners > 0 | saltbridge_unique_partners > 0) & (abundance_class == "intolerant"))), "positions potentially involved in h-bond or salt bridge interactions that are intolerant to mutation"))
```
```{r making the saltbridge plot}
schbond_saltbridge_pten_low_abundance_positions <- subset(pten_positions, (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & abundance_class == "intolerant")
schbond_saltbridge_pten_not_low_abundance_positions <- subset(pten_positions, (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & (abundance_class != "intolerant" | is.na(abundance_class)))
pten_saltbridge_hbond_score_plot <- ggplot() +
geom_jitter(data = schbond_saltbridge_pten_low_abundance_positions, aes(x = "1. Substitution\nintolerant", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "1. Substitution\nintolerant", y = median(schbond_saltbridge_pten_low_abundance_positions$score, na.rm = TRUE), ymin = median(schbond_saltbridge_pten_low_abundance_positions$score, na.rm = TRUE), ymax = median(schbond_saltbridge_pten_low_abundance_positions$score, na.rm = TRUE)), color = "red") +
geom_jitter(data = schbond_saltbridge_pten_not_low_abundance_positions, aes(x = "2. Remaining", y = score), alpha = 0.2) +
geom_crossbar(aes(x = "2. Remaining", y = median(schbond_saltbridge_pten_not_low_abundance_positions$score, na.rm = TRUE), ymin = median(schbond_saltbridge_pten_not_low_abundance_positions$score, na.rm = TRUE), ymax = median(schbond_saltbridge_pten_not_low_abundance_positions$score, na.rm = TRUE)), color = "red") + annotate("text", x = "1. Substitution\nintolerant", y = 1.3, label = paste("n:",nrow(schbond_saltbridge_pten_low_abundance_positions))) +
annotate("text", x = "2. Remaining", y = 1.3, label = paste("n:",nrow(schbond_saltbridge_pten_not_low_abundance_positions))) +
xlab(NULL) + ylab("PTEN abundance score") + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ggsave("Output/PTEN_saltbridge_hbond_score_plot.pdf", pten_saltbridge_hbond_score_plot, height = 2.5, width = 1.2)
pten_saltbridge_hbond_score_plot
```
This chunk of code performs heirarchical clustering of these substitution intolerant positions containing potential hydrogen bonds and salt bridges, so find regions of the PTEN protein that may be particularly depending on these polar interactions for folding. The frequencies that these positions are observed in COSMIC are also shown as a separate plot.
```{r PTEN combining Hbonds and Saltbridges, warning = FALSE}
schbond_saltbridge_pten_positions <- subset(pten_positions, (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & abundance_class == "intolerant")
schbond_saltbridge_positions <- schbond_saltbridge_pten_positions[,c("sc_xca","sc_yca","sc_zca")]
rownames(schbond_saltbridge_positions) <- schbond_saltbridge_pten_positions$position
schbond_saltbridge_frame <- data.frame(matrix(ncol = nrow(schbond_saltbridge_pten_positions), nrow = nrow(schbond_saltbridge_pten_positions)))
colnames(schbond_saltbridge_frame) <- schbond_saltbridge_pten_positions$position; rownames(schbond_saltbridge_frame) <- schbond_saltbridge_pten_positions$position
schbond_saltbridge_frame2 <- dist(schbond_saltbridge_positions, method = "euclidean")
fit <- hclust(schbond_saltbridge_frame2, method="ward.D2")
dhc <- as.dendrogram(fit)
ddata <- dendro_data(dhc, type = "rectangle")
schbond_saltbridge_frame2_cut <- cutree(fit, h = 11)
schbond_saltbridge_frame2_cut <- data.frame(schbond_saltbridge_frame2_cut)
schbond_saltbridge_frame2_cut$label <- rownames(schbond_saltbridge_frame2_cut)
colnames(schbond_saltbridge_frame2_cut) <- c("group","label")
ddata_labels <- data.frame(ddata$labels)
ddata_labels <- merge(ddata_labels, schbond_saltbridge_frame2_cut, by = "label")
ddata_labels$group <- as.integer(ddata_labels$group)
ddata_labels$label <- as.character(ddata_labels$label)
ddata_labels$color_group <- 0
color_group_counter = 1
for(group_number in sort(unique(as.integer(ddata_labels$group)))){
query_group <- pten_positions[pten_positions$position %in% ddata_labels[ddata_labels$group == group_number,"label"],c("sc_xca","sc_yca","sc_zca")]
if(nrow(query_group) > 1){
query_matrix <- matrix(NA, nrow = nrow(query_group), ncol = nrow(query_group))
colnames(query_matrix) <- ddata_labels[ddata_labels$group == group_number,"label"]
rownames(query_matrix) <- ddata_labels[ddata_labels$group == group_number,"label"]
for(x in 1:nrow(query_group)){
for(y in 1:nrow(query_group)){
query_matrix[x,y] <- sqrt((query_group[x,"sc_xca"] - query_group[y,"sc_xca"])^2 + (query_group[x,"sc_yca"] - query_group[y,"sc_yca"])^2 + (query_group[x,"sc_zca"] - query_group[y,"sc_zca"])^2)
}
}
if(max(query_matrix) < 11){ddata_labels[ddata_labels$group == group_number,"color_group"] <- color_group_counter; color_group_counter = color_group_counter + 1}
}
}
ddata_labels$color_group_factor <- as.factor(ddata_labels$color_group)
sc_interaction_color_groups <- c("0"="grey50","1"="salmon","2"="brown","3"="limegreen","4"="violet","5"="orange","6"="yellow","7"="pink","8"="darkgreen")
Destabilizing_hbond_saltbridge_tree <- ggplot(segment(ddata)) +
geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
geom_text(data = ddata_labels, aes(x = x, y = y-0.2, label = label, hjust = 0, vjust = 0.5, angle = -90, colour = color_group_factor)) +
geom_hline(yintercept = 11, linetype = 2) + xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = NULL) + scale_y_continuous(expand = c(0,15)) + theme(axis.text.x = element_blank()) + scale_colour_manual(values = sc_interaction_color_groups)
ggsave(file = "Output/PTEN_destabilizing_hbond_saltbridge_tree.pdf", Destabilizing_hbond_saltbridge_tree, height = 3, width = 6)
Destabilizing_hbond_saltbridge_tree
ddata_labels2 <- data.frame("position" = ddata_labels$label, "group" = ddata_labels$color_group)
ddata_labels2 <- merge(ddata_labels2, pten_positions[,c("position","cosmic_count")], by = "position", all.x = TRUE)
ddata_labels2 <- ddata_labels2[order(-ddata_labels2$cosmic_count),]
ddata_labels2[is.na(ddata_labels2)] <- 0
ddata_labels2 <- ddata_labels2[order(ddata_labels2$group),]
PTEN_destabilizing_hbond_cosmic_counts <- ggplot() + geom_point(data = ddata_labels2, aes(x = group, y = cosmic_count), alpha = 0.5) +
geom_text(data = subset(ddata_labels2, cosmic_count > 2), aes(x = group, y = cosmic_count), label = subset(ddata_labels2, cosmic_count > 2)[,"position"], hjust = 0, position = position_nudge(x = 0.1)) + ylab("Frequency of mutation in COSIMC") + xlab("Cluster group of mutation intolerant positions\nlikely involved in polar contacts") +
scale_x_continuous(breaks = seq(1,8)) + scale_y_continuous(breaks = seq(0,20,2))
ggsave(file = "Output/PTEN_destabilizing_hbond_cosmic_counts.pdf", PTEN_destabilizing_hbond_cosmic_counts, height = 2.5, width = 3)
PTEN_destabilizing_hbond_cosmic_counts
PTEN_all_cosmic_counts <- ggplot() + scale_x_continuous(limit = c(0,35)) +
geom_histogram(data = pten_positions, aes(x = cosmic_count), alpha = 0.5, binwidth = 1) +
geom_text(data = subset(pten_positions, cosmic_count > 7 & (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & abundance_class == "intolerant" & ! position %in% c(252,277)), aes(x = cosmic_count, y = 15, label = position, angle = 45), color = "red") +
geom_text(data = subset(pten_positions, cosmic_count > 7 & (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & abundance_class == "intolerant" & position %in% c(252,277)), aes(x = cosmic_count, y = 25, label = position, angle = 45), color = "red") +
geom_point(data = subset(pten_positions, cosmic_count > 7 & (saltbridge_unique_partners >= 1 | schbond_unique_partners >= 1) & abundance_class == "intolerant"), aes(x = cosmic_count, y = 10), color = "red", size = 0.5) +
xlab("Number of mutations in COSMIC") + ylab("Number of residues") +
annotate("text", x = 32, y = 30, label = "Not shown:\nPosition 173: 93\nPosition 130: 355", angle = 90, hjust = 0)
ggsave(file = "Output/PTEN_cosmic_counts.pdf", PTEN_all_cosmic_counts, height = 3, width = 3.5)
PTEN_all_cosmic_counts
```
```{r Paste the low abundance saltbridge positions in COSMIC, echo = FALSE}
paste("The number of mutations in COSMIC for position 24 and 26 are",toString(subset(pten_positions, position %in% c(24, 26))$cosmic_count))
paste("The number of mutations in COSMIC for positions 66, 68, 107, 111, 115 are",toString(subset(pten_positions, position %in% c(66,68,107,111,115))$cosmic_count))
paste("The number of mutations in COSMIC for position 153 and 172 are",toString(subset(pten_positions, position %in% c(153, 172))$cosmic_count))
paste("The number of mutations in COSMIC for position 170 and 329 are",toString(subset(pten_positions, position %in% c(170, 329))$cosmic_count))
paste("The number of mutations in COSMIC for positions 177, 252, 276, 277 are",toString(subset(pten_positions, position %in% c(177,252,276,277))$cosmic_count))
paste("The number of mutations in COSMIC for position 179 and 184 are",toString(subset(pten_positions, position %in% c(179, 184))$cosmic_count))
paste("The number of mutations in COSMIC for positions 254, 256, and 269 are",toString(subset(pten_positions, position %in% c(254,256,269))$cosmic_count))
paste("The number of mutations in COSMIC for positions 322, 331, and 334 are",toString(subset(pten_positions, position %in% c(322,331,334))$cosmic_count))
```
This chunk of code makes a .pml file for displaying these potentially critical h-bonds and salt bridges in Pymol, using the same grouping and color scheme used in the above plots.
```{r PTEN making a pml file for the Hbonds and Salt Bridges}
Pymol_hbonds_saltbridges_vector <- c()
Pymol_hbonds_saltbridges_vector <- append(Pymol_hbonds_saltbridges_vector, c(paste("create sc_hbonds_unstable, resi ",gsub(", ", "+",toString(subset(pten_positions, schbond_unique_partners >= 1 & abundance_class == "intolerant")$position))," and not name c+n+o", sep = ""), "show sticks, sc_hbonds_unstable", "show surface, sc_hbonds_unstable", "set transparency, 0.2, sc_hbonds_unstable"))
Pymol_hbonds_saltbridges_vector <- append(Pymol_hbonds_saltbridges_vector, c(paste("create saltbridge_unstable, resi ",gsub(", ","+",toString(subset(pten_positions, saltbridge_unique_partners >= 1 & abundance_class == "intolerant")$position))," and not name c+n+o", sep = ""), "show sticks, saltbridge_unstable", "show surface, saltbridge_unstable", "set transparency, 0.2, saltbridge_unstable"))
group_color_vector <- c("salmon","brown","limegreen","violet","orange","yellow","hotpink","forest")
color_group_commands <- c()
for(x in 1:max(ddata_labels$color_group)){
color_group_commands <- append(color_group_commands, paste("select group", x, ", (sc_hbonds_unstable or saltbridge_unstable) and resi ",gsub(", ","+",toString(ddata_labels[ddata_labels$color_group == x,"label"])), " and not name c+n+o; color ",group_color_vector[x],", group", x, sep=""))
}
fileConn<-file("Output/Pymol_2_PTEN_Hbonds_saltbridges.pml")
writeLines(c(pymol_setup,pymol_color_setup_vector,pymol_color_score_vector,Pymol_hbonds_saltbridges_vector,pymol_pten_standard_view,color_group_commands
,paste("png ",markdown_directory,"/Output/Pymol_2_PTEN_Hbonds_saltbridges.png, width=10cm, dpi=300, ray=1",sep="")
), fileConn)
close(fileConn)
```
#### Figuring out which PTEN positions have high scores
We identified positions in PTEN that had mean abundance scores higher than WT. Some of these enhanced-abundance positions were in structurally resolved regions, and many of them were within 7 Å of known phospholipid-binding positions. In comparison, far fewer of all structurally resolved PTEN positions were within 7 Å of phospholipid-binding positions (Supplementary Fig. 4e). Thus, positions with abundance-enhancing variants tended to be near the membrane-proximal face of PTEN, and included those important for binding PIP3, PIP2 or PI(3)P.
```{r High score analysis, warning = FALSE}
pten_high_score_positions <- subset(pten_positions, abundance_class == "enhanced")
paste("There are",nrow(pten_high_score_positions),"positions in PTEN that had mean abundance scores higher than WT")
pten_high_score_positions_structure <- subset(pten_high_score_positions, !is.na(xca))
pten_high_score_positions_nostructure <- subset(pten_high_score_positions, is.na(xca))
pten_membrane_positions <- c(14,47,130,seq(260,268))
pten_high_score_positions_temp_matrix <- data.frame(matrix(NA, nrow = nrow(pten_high_score_positions_structure), ncol = length(pten_membrane_positions)))
for(row_counter in 1:nrow(pten_high_score_positions_structure)){
for(col_counter in 1:length(pten_membrane_positions)){
#print(paste(row_counter,col_counter))
pten_high_score_positions_temp_matrix[row_counter,col_counter] <- sqrt((subset(pten_positions, position == pten_membrane_positions[col_counter])$xca - pten_high_score_positions_structure[row_counter,"xca"])^2 +
(subset(pten_positions, position == pten_membrane_positions[col_counter])$yca - pten_high_score_positions_structure[row_counter,"yca"])^2 +
(subset(pten_positions, position == pten_membrane_positions[col_counter])$zca - pten_high_score_positions_structure[row_counter,"zca"])^2)
}
}
pten_high_score_positions_temp_matrix$row_min <- apply(pten_high_score_positions_temp_matrix,1,min)