diversity and co_occurrence analysis.Rmd

---
title: "Analysis of serotype co-occurrence and diversity"
author: "Dan Weinberger"
date: "September 19, 2019"
output: 
  html_document:
    self_contained: no
---

```{r setup, include=FALSE}
#knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(dev="tiff", 
               dev.args=list(type="cairo"),
               dpi=300)
library(reshape2)
library(readr)
library(MASS)
library(rio)
library(readxl)
library(htmlTable)
library(arules)
library(arulesViz)
library(gridExtra)
library(ggplot2)
library(abind)
library(ggrepel)
```

##Read in main data
Clonal cluster
```{r}
cc1<-read.csv('./Data/serotype_cluster_mat.csv')
names(cc1)[1]<-'st'

```

MLST
```{r}
mlst1<-read.csv('./Data/serotype_mlst_mat.csv')
names(mlst1)[1]<-'st'
```

GPS
```{r}
gps1<-read_excel('./Data/gladstone gps.xlsx',guess_max=10000)
gps1<-gps1[,c('In_Silico_Serotype', 'GPSC')]
names(gps1)<-c('st','SC')
gps1$count<-1
gps1$st[gps1$st=='15BC']<-'15B/C'
gps1.m<-melt(gps1, id.vars=c('st','SC'))
gps1.c<- dcast(gps1.m,st~SC,fun.aggregate = sum )
```

Phylogenetic tree based groupings from GPSC. These data are downloaded and processed in the file microreact_data.Rmd. We evaluate 3 different cut points for grouping on the phylogenetic tree; at 100, 200 and 300. 100 results in more groups, and is more stringest, 300 is a broader grouping and less stringent (but still more stringent than GPSC)
```{r}
phylo100<-read.csv('phylo.grps100.csv')
phylo100<-phylo100[,c('In_Silico_Serotype','phylo.grp' )]
names(phylo100)<-c('st','PG')
phylo100$st<-as.character(phylo100$st)
phylo100$count=1
phylo100$st[phylo100$st=='15BC']<-'15B/C'
phylo100.m<-melt(phylo100, id.vars=c('st','PG'))
phylo100.c<- dcast(phylo100.m,st~PG,fun.aggregate = sum )

phylo200<-read.csv('phylo.grps200.csv')
phylo200<-phylo200[,c('In_Silico_Serotype','phylo.grp' )]
names(phylo200)<-c('st','PG')
phylo200$st<-as.character(phylo200$st)
phylo200$count=1
phylo200$st[phylo200$st=='15BC']<-'15B/C'
phylo200.m<-melt(phylo200, id.vars=c('st','PG'))
phylo200.c<- dcast(phylo200.m,st~PG,fun.aggregate = sum )

phylo300<-read.csv('phylo.grps300.csv')
phylo300<-phylo300[,c('In_Silico_Serotype','phylo.grp' )]
names(phylo300)<-c('st','PG')
phylo300$st<-as.character(phylo300$st)
phylo300$count=1
phylo300$st[phylo300$st=='15BC']<-'15B/C'
phylo300.m<-melt(phylo300, id.vars=c('st','PG'))
phylo300.c<- dcast(phylo300.m,st~PG,fun.aggregate = sum )
```


##Read in auxillary data
```{r cars}
carr1<-import('https://raw.githubusercontent.com/weinbergerlab/GrowthVariation/master/Data/carr%20data.csv')
carr1$ST[carr1$ST %in% c('15B','15C')]<-'15B/C'
carr1<-aggregate(carr1[,-1], by=list('ST'=carr1$ST), FUN=sum)

ps1<-import('https://raw.githubusercontent.com/weinbergerlab/GrowthVariation/master/Data/PS%20Composition_SS_final.csv')
ps1$Ribitol<-ps1$`Rib-ol`
ps1$Serotype[ps1$Serotype=='15B']<-'15B/C'
ps1$any.nac<-apply(ps1[,c('FucNAc','GalNAc', 'ManNAc', 'ManNAcA','PneNAc', 'GlcNAc')], 1,sum)
ps1$any.nac<-(ps1$any.nac>0)*1
```

##Co occurrence analysis
Generate co-occurrence matrix. Count how often StA and StB occur on same CC/MLST/GPSC
```{r}
co.occur.func<-function(ds){
    st.mlstN.alt<- cbind.data.frame('st'=as.character(ds[,1]),  'mlstN'=apply(ds[,-1],1,function(x) sum(x>0)))
    
    #Co-occurrence matrix
    cc1.m<-reshape2::melt(ds, id.vars = 'st')
    names(cc1.m)<-c('st','mlst','n.isolate')
    N.mlst<-length(unique(cc1.m$mlst))
    cc1.spl<-split(cc1.m, cc1.m$mlst)
    cc1.spl<-lapply(cc1.spl, function(x) x<-x[x$n.isolate>0,])
    obs.pairs<-lapply(cc1.spl, function(x) if(nrow(x)>=2){matrix(as.character(t(combn(x$st, m=2))), ncol=2)}  ) 
    #combinat
    obs.pairs<-do.call('rbind',obs.pairs )
    obs.pairs<-as.data.frame(obs.pairs)
    obs.pairs$count<-1
    names(obs.pairs)<-c('stA','stB','count')
    co.occur<-aggregate(obs.pairs[,'count', drop=F], by=list('stA'=obs.pairs$stA, 'stB'=obs.pairs$stB), FUN=sum)
    #############
    ##Ensures we have a symmetric matrix
    co.occur.flip<-co.occur
    names(co.occur.flip)<-c('stB', 'stA','count' )
    co.occur<-rbind.data.frame(co.occur,co.occur.flip) 
    #############
    co.occur.m<-melt(co.occur, id.vars=c('stA', 'stB'))
    co.occur.c1<-dcast(co.occur.m, stB~stA)
    co.occur.cc<-melt(co.occur.c1, id.vars=c('stB'))
    names(co.occur.cc)<- c('Sero1', 'Sero2','ccN' )
    co.occur.cc$Sero1<-as.character(co.occur.cc$Sero1)
    co.occur.cc$Sero2<-as.character(co.occur.cc$Sero2)
    co.occur.cc$ccN[is.na(co.occur.cc$ccN)]<-0
    
    cc2<-merge(co.occur.cc,st.mlstN.alt,by.x='Sero1', by.y='st' , all=T)
    cc2<-merge(cc2,st.mlstN.alt,by.x='Sero2', by.y='st' )
    
    cc2$st.mlstN.combined<-cc2$mlstN.x/N.mlst * cc2$mlstN.y/N.mlst
    box.cox.func<-function(x, lambda){
      y<- (x^lambda-1)/lambda
    }
    cc2$log.st.mlstn.combined<- box.cox.func(cc2$st.mlstN.combined, lambda=0.01)
    hist(cc2$log.st.mlstn.combined)
    mean(cc2$log.st.mlstn.combined)
    median(cc2$log.st.mlstn.combined)
    range(cc2$log.st.mlstn.combined)
    
    cc2$sgA<-as.numeric(gsub("([0-9]+).*$", "\\1", cc2$Sero1))
    cc2$sgB<-as.numeric(gsub("([0-9]+).*$", "\\1", cc2$Sero2))
    cc2$same.sg<- as.numeric((cc2$sgA==cc2$sgB))
    plot(cc2$log.st.mlstn.combined, log(cc2$ccN+0.5))

    cc3<-merge(cc2,ps1, by.x='Sero1',by.y='Serotype')
    ds2<-merge(cc3,ps1, by.x='Sero2',by.y='Serotype')
    ds2$both.glcA<- as.factor(ds2$GlcA.x +  ds2$GlcA.y )
    ds2$both.glcA<-relevel(ds2$both.glcA,ref='1')
    ds2$both.gal<- as.factor(ds2$Gal.x + ds2$Gal.y)
    ds2$both.gal<-relevel(ds2$both.gal,ref='1')
    ds2$both.glcnac<- as.factor(ds2$GlcNAc.x + ds2$GlcNAc.y)
    ds2$both.glc<- as.factor(ds2$Glc.x + ds2$Glc.y)
    ds2$both.glc<-relevel(ds2$both.glc,ref='1')
    ds2$both.ac<- as.factor(ds2$Ac.y + ds2$Ac.y)
    ds2$both.rha<- as.factor(ds2$Rha.x + ds2$Rha.y)
    ds2$both.gro<- as.factor(ds2$Gro.x + ds2$Gro.y)
    ds2$both.ribol<- as.factor(ds2$`Rib-ol.x` + ds2$`Rib-ol.y`)
    ds2$both.cho<- as.factor(ds2$Cho.x + ds2$Cho.y)
    ds2$both.fucnac<- as.factor(ds2$FucNAc.x + ds2$FucNAc.y)
    ds2$both.mannac<- as.factor(ds2$ManNAc.x + ds2$ManNAc.y)
    ds2$NAC.x.N<- apply(ds2[,c('FucNAc.x','GalNAc.x', 'ManNAc.x', 'ManNAcA.x','PneNAc.x', 'GlcNAc.x')], 1,sum)
    ds2$NAC.y.N<- apply(ds2[,c('FucNAc.y','GalNAc.y', 'ManNAc.y', 'ManNAcA.y','PneNAc.y', 'GlcNAc.y')], 1,sum)
    ds2$both.NAC<-as.factor( (ds2$NAC.x.N>0) + (ds2$NAC.y.N>0))
    ds2$both.NAC<-relevel(ds2$both.NAC,ref='1')
    
    ds2$pair<-as.factor(paste(ds2$Sero1, ds2$Sero2, sep='_'))
    
    ds2$share_mlst_bin<-ds2$ccN
    ds2$share_mlst_bin[ds2$share_mlst_bin>1]<-1
return(ds2)
}
```

Format the data for regression
```{r}
mlst.co<-co.occur.func(mlst1)
cc.co<-co.occur.func(cc1)
gps.co<-co.occur.func(gps1.c)
phylo100.co<-co.occur.func(phylo100.c)
phylo200.co<-co.occur.func(phylo200.c)
phylo300.co<-co.occur.func(phylo300.c)

write.csv(mlst.co[,c('Sero1','Sero2','ccN','mlstN.x','mlstN.y','same.sg','both.glcA','both.gal','both.glc','both.glcnac','both.ac','both.rha','both.gro','both.ribol','both.cho','both.fucnac','both.mannac','both.NAC' )],'co.occurrence.mlst.csv')
write.csv(cc.co[,c('Sero1','Sero2','ccN','mlstN.x','mlstN.y','same.sg','both.glcA','both.gal','both.glc','both.glcnac','both.ac','both.rha','both.gro','both.ribol','both.cho','both.fucnac','both.mannac','both.NAC' )],'co.occurrence.cc.csv')
write.csv(gps.co[,c('Sero1','Sero2','ccN','mlstN.x','mlstN.y','same.sg','both.glcA','both.gal','both.glc','both.glcnac','both.ac','both.rha','both.gro','both.ribol','both.cho','both.fucnac','both.mannac','both.NAC' )],'co.occurrence.gps.csv')

```

```{r}
reg.func2<-function(ds2){
  sug.vars<-c('both.glcA','both.NAC','both.rha', 'both.gal', 'both.glcnac','both.gro','both.fucnac','both.ribol','both.glc','both.mannac')
  poisson.list<-  vector("list", length(sug.vars))
  binomial.list<-  vector("list", length(sug.vars))
  for(i in sug.vars){
    form1<-as.formula(paste0('ccN ~  log.st.mlstn.combined +',i))
    m1 <- glm(form1  ,  data=ds2[ds2[,i] %in% c('1','2') & ds2$same.sg==0,], family='quasipoisson')
   # print(i)
    #print(summary(m1)$coefficients[3,])
    poisson.list[[i]]<-c('estimate'=summary(m1)$coefficients[3,'Estimate'], confint(m1)[3,])
  }
  
  #Dichotomous instead
  for(i in sug.vars){
    form1<-as.formula(paste0('share_mlst_bin ~  log.st.mlstn.combined +',i))
    m1 <- glm(form1  ,  data=ds2[ds2[,i] %in% c('1','2') & ds2$same.sg==0,], family='binomial')
    #print(i)
    #print(summary(m1)$coefficients[paste0(i,2),])
    binomial.list[[i]]<-c('estimate'=summary(m1)$coefficients[3,'Estimate'], confint(m1)[3,])
  }
  
  binomial.res<-do.call('rbind', binomial.list)
  binomial.res<-round(binomial.res, 2)
  binomial.res.format<-apply(binomial.res,1, function(x) paste0(x['estimate'] ," (",x['2.5 %'], ',',x['97.5 %']  ,")") )
  
  poisson.res<-do.call('rbind', poisson.list)
  poisson.res<-round(poisson.res, 2)
  poisson.res.format<-apply(poisson.res,1, function(x) paste0(x['estimate'] ," (",x['2.5 %'], ',',x['97.5 %']  ,")") )
  
  out.tab <- cbind.data.frame(poisson.res.format, binomial.res.format)
  names(out.tab)<-c('Poisson', 'Logistic')
  outlist<-list('out.tab.formatted'=out.tab,'binomial.res'=binomial.res,'poisson.res'=poisson.res )
  return(outlist)
}
```

```{r}
mlst.mod<-reg.func2(mlst.co)
mlst.tab<-mlst.mod$out.tab.formatted
names(mlst.tab)<-c('Poisson MLST', 'Logistic MLST')

cc.mod<-reg.func2(cc.co)
cc.tab<-cc.mod$out.tab.formatted
names(cc.tab)<-c('Poisson CC', 'Logistic CC')

gps.mod<-reg.func2(gps.co)
gps.tab<-gps.mod$out.tab.formatted
names(gps.tab)<-c('Poisson GPSC', 'Logistic GPSC')

```
Model with phylogenetic groups. Model for data with cut at 100 doesn't run 
```{r}
# phylo100.mod<-reg.func2(phylo100.co)
# phylo100.tab<-phylo.mod$out.tab.formatted
# names(phylo100.tab)<-c('Poisson Phylo100', 'Logistic Phylo100')
# phylo100.tab

phylo200.mod<-reg.func2(phylo200.co)
phylo200.tab<-phylo200.mod$out.tab.formatted
names(phylo200.tab)<-c('Poisson Phylo200', 'Logistic Phylo200')
phylo200.tab

phylo300.mod<-reg.func2(phylo300.co)
phylo300.tab<-phylo300.mod$out.tab.formatted
names(phylo300.tab)<-c('Poisson Phylo300', 'Logistic Phylo300')
phylo300.tab
```


```{r}
combined.tab<-cbind( mlst.tab,cc.tab,gps.tab)
combined.tab$sugar<-NULL
combined.tab$X<-NULL
combined.tab<-combined.tab[c('both.glc','both.glcA','both.NAC','both.rha','both.gal','both.glcnac','both.gro','both.fucnac',
                             'both.ribol','both.mannac'), ]
row.names(combined.tab)<-c('Glc','GlcA','Any NAc', 'Rha','Gal','GlcNAc','Gro','FucNAc', 'Ribitol','ManNAc')
table1<-htmlTable(combined.tab, align='l',caption="Table 1. Association between the presence of specific sugars in the capsule and co-occurrence of serotype pairs on the same genetic background (MLST, CC, or GPSC). ")
print(table1,type="html",useViewer=TRUE)
```
also add in phylogenetic data
```{r}
combined.tab.phylo<-cbind( mlst.tab,cc.tab,gps.tab, phylo200.tab,phylo300.tab )
combined.tab.phylo$sugar<-NULL
combined.tab.phylo$X<-NULL
combined.tab.phylo<-combined.tab.phylo[c('both.glc','both.glcA','both.NAC','both.rha','both.gal','both.glcnac','both.gro','both.fucnac',
                             'both.ribol','both.mannac'), ]
row.names(combined.tab.phylo)<-c('Glc','GlcA','Any NAc', 'Rha','Gal','GlcNAc','Gro','FucNAc', 'Rib-ol','ManNAc')
table.phylo1<-htmlTable(combined.tab.phylo, align='l',caption="Table 1. Association between the presence of specific sugars in the capsule and co-occurrence of serotype pairs on the same genetic background (MLST, CC, or GPSC). ")
print(table.phylo1,type="html",useViewer=TRUE)
```

Figure of table 1 (Figure 2)
```{r fig2, fig.width=8, fig.height=4}
combined.array.bin<- abind(mlst.mod$binomial.res,cc.mod$binomial.res, gps.mod$binomial.res, along=3 )
combined.array.pois<- abind(mlst.mod$poisson.res,cc.mod$poisson.res, gps.mod$poisson.res, along=3 )
combined.array<-abind(combined.array.bin,combined.array.pois, along=4)
dimnames(combined.array)[[3]]<-c('mlst','cc','gps')
dimnames(combined.array)[[4]]<-c('binomial','poisson')
combined.array<-combined.array[c('both.glc','both.glcA','both.NAC','both.rha','both.gal','both.glcnac','both.gro','both.fucnac',
                             'both.ribol','both.mannac'),,,]
 models<-c('binomial', 'poisson')
  model.labs<-c('Logistic Regression', 'Poisson Regression')
par(mfrow=c(1,2))
for(i in 1:2){
 
plot.array<-combined.array[,,,models[i]]
plotrange<-range(plot.array[,c("estimate", "2.5 %","97.5 %"),])
plotrange[plotrange>3]<-3
plot(plot.array[1:dim(plot.array)[1],'estimate','cc'], pch=16, col='#e41a1c', bty='l', ylim=c(-1,2), ylab=paste0(model.labs[i],' Coefficient'), xaxt='n', xlab='', xlim=c(0.9,dim(plot.array)[1]+0.55))
points(((1:dim(plot.array)[1]+0.25)),plot.array[,'estimate','mlst'], pch=17, col='#377eb8')
points(((1:dim(plot.array)[1]+0.5)),plot.array[,'estimate','gps'], pch=18, col='#4daf4a')

arrows(x0=1:dim(plot.array)[1],y0=plot.array[,"2.5 %",'cc'],y1=plot.array[,"97.5 %",'cc'], pch=16, col='#e41a1c', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.25)),y0=plot.array[,"2.5 %",'mlst'],y1=plot.array[,"97.5 %",'mlst'], pch=16, col='#377eb8', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.5)),y0=plot.array[,"2.5 %",'gps'],y1=plot.array[,"97.5 %",'gps'], pch=16, col='#4daf4a', length=0)

abline(h=0, lty=2, col='gray')
text(1:10, y=-1, c('Glc','GlcA','Any NAc', 'Rha','Gal','GlcNAc','Gro','FucNAc', 'Ribitol','ManNAc'), pos=4, offset=0, xpd=NA, col='gray', cex=0.75, srt=90)
legend(1.1,2, legend=c("CC", "MLST", "GPSC"),
       col=c("#e41a1c", "#377eb8", "#4daf4a"), pch=c(16,17,18), box.lty=0, adj=c(0.1, 0.5))
}
```

Figure of table 1 +phylo data 
```{r, fig.width=8, fig.height=4}
combined.array.bin<- abind(mlst.mod$binomial.res,cc.mod$binomial.res, gps.mod$binomial.res ,phylo200.mod$binomial.res,phylo300.mod$binomial.res , along=3 )
combined.array.pois<- abind(mlst.mod$poisson.res,cc.mod$poisson.res, gps.mod$poisson.res,phylo200.mod$poisson.res,phylo300.mod$poisson.res, along=3 )
combined.array<-abind(combined.array.bin,combined.array.pois, along=4)
dimnames(combined.array)[[3]]<-c('mlst','cc','gps', 'phylo200', 'phylo300')
dimnames(combined.array)[[4]]<-c('binomial','poisson')
combined.array<-combined.array[c('both.glc','both.glcA','both.NAC','both.rha','both.gal','both.glcnac','both.gro','both.fucnac',
                             'both.ribol','both.mannac'),,,]
 models<-c('binomial', 'poisson')
  model.labs<-c('Logistic Regression', 'Poisson Regression')
par(mfrow=c(1,2))
for(i in 1:2){
 
plot.array<-combined.array[,,,models[i]]
plotrange<-range(plot.array[,c("estimate", "2.5 %","97.5 %"),])
plotrange[plotrange>3]<-3
plot(plot.array[1:dim(plot.array)[1],'estimate','cc'], pch=16, col='#e41a1c', bty='l', ylim=c(-1,3), ylab=paste0(model.labs[i],' Coefficient'), xaxt='n', xlab='', xlim=c(0.9,dim(plot.array)[1]+0.55))
points(((1:dim(plot.array)[1]+0.1)),plot.array[,'estimate','mlst'], pch=17, col='#377eb8')
points(((1:dim(plot.array)[1]+0.2)),plot.array[,'estimate','gps'], pch=18, col='#4daf4a')
points(((1:dim(plot.array)[1]+0.4)),plot.array[,'estimate','phylo200'], pch=18, col='#ff7f00')
points(((1:dim(plot.array)[1]+0.3)),plot.array[,'estimate','phylo300'], pch=18, col='#a65628')


arrows(x0=1:dim(plot.array)[1],y0=plot.array[,"2.5 %",'cc'],y1=plot.array[,"97.5 %",'cc'], pch=16, col='#e41a1c', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.1)),y0=plot.array[,"2.5 %",'mlst'],y1=plot.array[,"97.5 %",'mlst'], pch=16, col='#377eb8', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.2)),y0=plot.array[,"2.5 %",'gps'],y1=plot.array[,"97.5 %",'gps'], pch=16, col='#4daf4a', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.4)),y0=plot.array[,"2.5 %",'phylo200'],y1=plot.array[,"97.5 %",'phylo200'], pch=16, col='#ff7f00', length=0)
arrows(x0=((1:dim(plot.array)[1]+0.3)),y0=plot.array[,"2.5 %",'phylo300'],y1=plot.array[,"97.5 %",'phylo300'], pch=16, col='#a65628', length=0)


abline(h=0, lty=2, col='gray')
text(1:10, y=-1, c('Glc','GlcA','Any NAc', 'Rha','Gal','GlcNAc','Gro','FucNAc', 'Rib-ol','ManNAc'), pos=4, offset=0, xpd=NA, col='gray', cex=0.75, srt=90)
legend(1.1,3, legend=c("CC", "MLST", "GPSC", 'Phylo300', 'Phylo200'),
       col=c("#e41a1c", "#377eb8", "#4daf4a","#a65628","#ff7f00"), pch=c(16,17,18, 18,18), box.lty=0, adj=c(0.1, 0.5))
}
```


### Market basket analysis
```{r}
mb.func<-function(ds1){
    sero_st<-ds1
    #Assign serotypes as row names
    rownames(sero_st) <- sero_st$st
    #Remove first column containing serotypes
    sero_st <- sero_st[,-1]
    
    #Remove X from column names
    colnames(sero_st) <- substring(colnames(sero_st),2)
    #Convert the matrix to binary
    sero_st[sero_st != 0] <- 1 
    
    #Transpose data
    sero_st <- data.frame(t(sero_st))
    #Remove X from column names
    colnames(sero_st) <- substring(colnames(sero_st),2)
    
    #Change the column name for 15B.C to 15B/C
    colnames(sero_st)[which(colnames(sero_st) == "15B.C")] <- "15B/C"
    colnames(sero_st)[which(colnames(sero_st) == "7B.C")] <- "7B/C"
    colnames(sero_st)[which(colnames(sero_st) == "6C.D")] <- "6C/D"
    #sero_st <- sero_st[which(rowSums(sero_st) != 0),]
    #sero_st <- sero_st[-which(rowSums(sero_st) == 1),]
    
    #Convert dataset to transcation data type for market basket analysis
    ps_mat <- as.matrix(sero_st)
    ps_tran <- as(ps_mat,"transactions")
   # summary(ps_tran)
    
    #The plot shows the frequency of serotypes
   # itemFrequencyPlot(ps_tran, topN = 25, type='absolute')
  #  itemFrequency(ps_tran, type='absolute')
    
    #Since we are interested in the pairwise interactions between serotypes,
    #rules with length of two will be generated to identify patterns of serotype co-occurence
    rules_2 <- apriori(ps_tran,parameter = list(supp = 0.0001, conf = 0.0001,minlen=2, maxlen=2))
   # summary(rules_2)
    #Data frame for rules with 2 serotypes sorted by support
    rules_2sero <- inspect(sort(rules_2, by ="lift"))
    
    #Default plot of rules_2 from arules
    #plot(rules_2, method = "graph")
    
    #filter 
    rule.inc <- subset(rules_2, subset =  lift < 1 & rules_2@quality$count>1)
    #plot(rule.inc, method = "graph")
    rule.inc.sig<-interestMeasure(rule.inc, measure = c("chiSquared","fishersExactTest"), transactions = ps_tran)
    rule.inc2<-cbind(inspect(rule.inc),rule.inc.sig)
    #rule.inc2[rule.inc2$fishersExactTest<0.05,]
    
    #The following code adds column for p-values from Fishers Exact test
    #Make a dataframe with quality values for rules
    r2_qual <- quality(rules_2)
    #Include interest measures
    r2_qual <- cbind(inspect(rules_2), interestMeasure(rules_2, measure = c("chiSquared","fishersExactTest"), transactions = ps_tran))
    #r2_qual_sig <- r2_qual[which(r2_qual$count > 9),]
    
    #Drop duplicates from final results in r2_qual
    #Function to remove duplicated combinations occuring on even number of rows
    rem_dup <- function(comb_df){
      i <- 1:nrow(comb_df)
      j <- which(i %% 2 == 0)
      comb_df <- comb_df[j,]
    }
    
    #Remove duplicate rules
    rules_all_2sero <- rem_dup(r2_qual)
    rules_sig_2sero <- rules_all_2sero[rules_all_2sero$count > 2,]
    
    rules_sig_2sero<-rules_sig_2sero[order(-rules_sig_2sero$lift) ,]
    rules_sig_2sero<-rules_sig_2sero[rules_sig_2sero$fishersExactTest<0.05 & rules_sig_2sero$count>3 & rules_sig_2sero$lift<1 ,]
   # rules_sig_2sero[rules_sig_2sero$count>3 & rules_sig_2sero$lift>1,]
    res<-list('rules_2sero'=rules_2sero, 'rules_sig_2sero'=rules_sig_2sero )
    return(res)
}
```

Run the Market Basket analysis
```{r}
cc1.mb<-mb.func(cc1)
mlst1.mb<-mb.func(mlst1)
gps1.mb<- mb.func(gps1.c)
```
Significant rules, ordered by lift
```{r}
gps1.mb$rules_sig_2sero
```
```{r}
cc1.mb$rules_sig_2sero
```

## Diversity analysis
Diversity analysis
```{r}
sugs<-c("Ac","FucNAc" ,'Gal',"GalNAc",'Glc', "GlcA","GlcNAc",   "Gro","ManNAc",'Rha','any.nac','Ribitol')
diversity.func<-function(ds){
  st.cc.diversity<- 1 -apply(ds[,-1], 1, function(x) sum((x/sum(x))^2) )
  st.cc.sum<- apply(ds[,-1], 1, function(x) sum(x ))
  st.cc.diversity<-cbind.data.frame('st'=ds[,1], st.cc.diversity,st.cc.sum)
  st.cc.diversity<-st.cc.diversity[st.cc.sum>10,] #drop if <10 isolates in  database
st.cc.diversity<-st.cc.diversity[order(st.cc.diversity[,'st.cc.diversity']),]
  return(st.cc.diversity)
}
#Function to evauate correlations
diversity.cor.func<-function(ds, sug.list){
  st.cc.diversity<-merge(ds, carr1, by.x='st', by.y='ST', all=T)
  st.cc.diversity<-merge(st.cc.diversity, ps1, by.x='st', by.y='Serotype', all=T)
  st.cc.diversity$SLEEMANCARRN[is.na(st.cc.diversity$SLEEMANCARRN)]<-0
  st.cc.diversity$NORWAYCARR_PRE[is.na(st.cc.diversity$NORWAYCARR_PRE)]<-0
  st.cc.diversity<-st.cc.diversity[!is.na(st.cc.diversity$st.cc.diversity),]
  st.cc.diversity<-st.cc.diversity[!is.na(st.cc.diversity$Glc),]
  st.cc.diversity$st.cc.diversity[st.cc.diversity$st.cc.diversity==0]<-0.01
  st.cc.diversity$st.cc.diversity[st.cc.diversity$st.cc.diversity==1]<-0.99
  logit.diversity<- log(st.cc.diversity$st.cc.diversity/(1-st.cc.diversity$st.cc.diversity))
 # plot(logit.diversity, log(st.cc.diversity$st.cc.sum))
form1<-as.formula(paste0('logit.diversity~ sqrt(SLEEMANCARRN) +', sug.list)    )
  mod1<-lm(form1, data=st.cc.diversity)
  summary(mod1)$df[2]
  sug.est<-c('estimate'=round(summary(mod1)$coefficients[sug.list,'Estimate'],2), round(confint(mod1)[sug.list,],2))
  AIC1<-round(AIC(mod1),1)
  df<-  summary(mod1)$df[2]
  sdi.result<-c('AIC'=AIC1, sug.est,'df'=df)
  return(sdi.result)
}
```

Diversity by CC

```{r}
cc.sdi<-diversity.func(cc1)
cc.div<-t(sapply(sugs, diversity.cor.func, ds=cc.sdi))
cc.div.sort<-cc.div[order(cc.div[,1]),]
cc.div.sort
cc.sdi.format<- cbind.data.frame('Sugar'=row.names(cc.div),'Coefficient (CC)'=paste0(cc.div[,'estimate'],', (',cc.div[,'2.5 %'],',',cc.div[,'97.5 %'],')'), 'AIC'=cc.div[,'AIC']  )
```

Diversity by MLST
```{r}
mlst.sdi<-diversity.func(mlst1)
mlst.div<-t(sapply(sugs, diversity.cor.func, ds=mlst.sdi))
mlst.div.sort<-mlst.div[order(mlst.div[,1]),]
mlst.div.sort
mlst.sdi.format<- cbind.data.frame('Sugar'=row.names(mlst.div),'Coefficient (MLST)'=paste0(mlst.div[,'estimate'],', (',mlst.div[,'2.5 %'],',',mlst.div[,'97.5 %'],')'))
```

Diversity by GPSC
```{r}
GPSC.sdi<-diversity.func(gps1.c)
GPSC.div<-t(sapply(sugs, diversity.cor.func, ds=GPSC.sdi))
GPSC.div.sort<-GPSC.div[order(GPSC.div[,1]),]
GPSC.div.sort
GPSC.sdi.format<- cbind.data.frame('Sugar'=row.names(GPSC.div),'Coefficient (GPSC)'=paste0(GPSC.div[,'estimate'],', (',GPSC.div[,'2.5 %'],',',GPSC.div[,'97.5 %'],')'))

```

Diversity by Phylo
```{r}
phylo.sdi100<-diversity.func(phylo100.c)
phylo.div100<-t(sapply(sugs, diversity.cor.func, ds=phylo.sdi100))
phylo.div.sort100<-phylo.div100[order(phylo.div100[,1]),]
phylo.div.sort100
phylo.sdi.format100<- cbind.data.frame('Sugar'=row.names(phylo.div100),'Coefficient (phylo100)'=paste0(phylo.div100[,'estimate'],', (',phylo.div100[,'2.5 %'],',',phylo.div100[,'97.5 %'],')'))

phylo.sdi200<-diversity.func(phylo200.c)
phylo.div200<-t(sapply(sugs, diversity.cor.func, ds=phylo.sdi200))
phylo.div.sort200<-phylo.div200[order(phylo.div200[,1]),]
phylo.div.sort200
phylo.sdi.format200<- cbind.data.frame('Sugar'=row.names(phylo.div200),'Coefficient (phylo200)'=paste0(phylo.div200[,'estimate'],', (',phylo.div200[,'2.5 %'],',',phylo.div200[,'97.5 %'],')'))

phylo.sdi300<-diversity.func(phylo300.c)
phylo.div300<-t(sapply(sugs, diversity.cor.func, ds=phylo.sdi300))
phylo.div.sort300<-phylo.div300[order(phylo.div300[,1]),]
phylo.div.sort300
phylo.sdi.format300<- cbind.data.frame('Sugar'=row.names(phylo.div300),'Coefficient (phylo300)'=paste0(phylo.div300[,'estimate'],', (',phylo.div300[,'2.5 %'],',',phylo.div300[,'97.5 %'],')'))


cbind(phylo.sdi.format100,phylo.sdi.format200[,2],phylo.sdi.format300[,2])
```


```{r fig3, fig.width=8, fig.height=4}
names(GPSC.sdi)<-c('st','sdi.gps','N.gps')
names(mlst.sdi)<-c('st','sdi.mlst','N.mlst')
names(cc.sdi)<-c('st','sdi.cc','N.cc')
#names(phylo.sdi100)<-c('st','sdi.phylo100','N.phylo100')

all.sdi<- merge(GPSC.sdi, mlst.sdi, by='st', all=T)
all.sdi<-merge(all.sdi, cc.sdi, by='st', all=T)
write.csv(all.sdi,'SDI.by.serotype.csv')

all.sdi<-all.sdi[order(all.sdi$sdi.cc),]
#Plot the CC Diversity

#MLST vs CC
all.sdi$st.common<-all.sdi$st
all.sdi$st.common[all.sdi$N.cc<200] <- NA
#MLST
p1<-ggplot(data=all.sdi, aes(x=sdi.cc, y=sdi.mlst)) +
  geom_text_repel(size=3,aes(label=all.sdi$st.common ), colour='gray70')+
  geom_point(alpha=0.5, colour='black',aes(size=c(all.sdi$N.cc+all.sdi$N.mlst) ))+
  scale_size_area()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Serotype Diversity (CC)")+
  ylab('Serotype Diversity (MLST)')
#GPSC
p2<-ggplot(data=all.sdi, aes(x=sdi.cc, y=sdi.gps)) +
  geom_text_repel(size=3,aes(label=all.sdi$st.common) , colour='gray70')+
  geom_point(alpha=0.5, colour='black',aes(size=c(all.sdi$N.cc+all.sdi$N.gps) ))+
  scale_size_area()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Serotype Diversity (CC)")+
  ylab('Serotype Diversity (GPSC)')
grid.arrange(p1, p2, nrow = 1)
```

Plots with phylo samples. This shows that with a cutoff of 100, everything is super diver, which makes sense because this is a very sensivite definition for splitting groups, resulting in lots of groups, even if closely related.200 and 300 look similar
```{r, fig.width=8, fig.height=3}
names(phylo.sdi100)<-c('st','phylo.div.100','N.div.100')
names(phylo.sdi200)<-c('st','phylo.div.200','N.div.200')
names(phylo.sdi300)<-c('st','phylo.div.300','N.div.300')

all.sdi.phylo<-merge(phylo.sdi100, all.sdi, by='st', all=T)
all.sdi.phylo<-merge(phylo.sdi200, all.sdi.phylo, by='st', all=T)
all.sdi.phylo<-merge(phylo.sdi300, all.sdi.phylo, by='st', all=T)

all.sdi.phylo<-all.sdi.phylo[order(all.sdi.phylo$sdi.cc),]

#phylo100
p1<-ggplot(data=all.sdi.phylo, aes(x=sdi.cc, y=phylo.div.100)) +
  geom_text_repel(size=3,aes(label=all.sdi$st.common ), colour='gray70')+
  geom_point(alpha=0.5, colour='black',aes(size=c(all.sdi$N.cc+all.sdi$N.mlst) ))+
  scale_size_area()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Serotype Diversity (CC)")+
  ylab('Serotype Diversity (Phylo100)')
#phylo200
p2<-ggplot(data=all.sdi.phylo, aes(x=sdi.cc, y=phylo.div.200)) +
  geom_text_repel(size=3,aes(label=all.sdi$st.common ), colour='gray70')+
  geom_point(alpha=0.5, colour='black',aes(size=c(all.sdi$N.cc+all.sdi$N.mlst) ))+
  scale_size_area()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Serotype Diversity (CC)")+
  ylab('Serotype Diversity (Phylo200)')
#Phylo300
p3<-ggplot(data=all.sdi.phylo, aes(x=sdi.cc, y=phylo.div.300)) +
  geom_text_repel(size=3,aes(label=all.sdi$st.common) , colour='gray70')+
  geom_point(alpha=0.5, colour='black',aes(size=c(all.sdi$N.cc+all.sdi$N.gps) ))+
  scale_size_area()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Serotype Diversity (CC)")+
  ylab('Serotype Diversity (Phylo300)')
grid.arrange(p1, p2, p3, nrow = 1)
```


### Summary table of Diversity analysis
```{r}
div.sum1<- merge(cc.sdi.format,mlst.sdi.format, by='Sugar')
div.sum1<- merge(div.sum1,GPSC.sdi.format, by='Sugar')
div.sum1<-div.sum1[order(div.sum1$AIC),]
div.sum1$AIC <- NULL
row.names(div.sum1)<- div.sum1$Sugar
div.sum1$Sugar<-NULL
table.div<-htmlTable(div.sum1, align='l',caption="Table 3. Association between the diversity of a serotype and the presence of specific sugars in the capsule, with diversity based on MLST, CC, or GPSC")
print(table.div,type="html",useViewer=TRUE)
table.div
```
Or a figure of the same
```{r fig4}
reg.array<-abind(cc.div, mlst.div, GPSC.div, along=3)
dimnames(reg.array)[[3]]<-c('cc','mlst','gpsc')
reg.array<-reg.array[order(reg.array[,'AIC','cc']),,]
plot(reg.array[1:dim(reg.array)[1],'estimate','cc'], pch=16, col='#e41a1c', bty='l', ylim=range(reg.array[,c("estimate", "2.5 %","97.5 %"),]), ylab='Coefficient', xaxt='n', xlab='')
points(((1:dim(reg.array)[1]+0.15)),reg.array[,'estimate','mlst'], pch=17, col='#377eb8')
points(((1:dim(reg.array)[1]+0.3)),reg.array[,'estimate','gpsc'], pch=18, col='#4daf4a')

arrows(x0=1:dim(reg.array)[1],y0=reg.array[,"2.5 %",'cc'],y1=reg.array[,"97.5 %",'cc'], pch=16, col='#e41a1c', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.15)),y0=reg.array[,"2.5 %",'mlst'],y1=reg.array[,"97.5 %",'mlst'], pch=16, col='#377eb8', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.3)),y0=reg.array[,"2.5 %",'gpsc'],y1=reg.array[,"97.5 %",'gpsc'], pch=16, col='#4daf4a', length=0)

abline(h=0, lty=2, col='gray')
text(1:12, y=-2.5, dimnames(reg.array)[[1]], pos=4, offset=0, xpd=NA, col='gray', cex=0.75)
legend("topright", legend=c("CC", "MLST", "GPSC"),
       col=c("#e41a1c", "#377eb8", "#4daf4a"), pch=c(16,17,18), box.lty=0)

```
Same, but also include the phylo results
```{r}
reg.array<-abind(cc.div, mlst.div, GPSC.div,phylo.div300,phylo.div200,phylo.div100, along=3)
dimnames(reg.array)[[3]]<-c('cc','mlst','gpsc', 'phylo300', 'phylo200', 'phylo100')
reg.array<-reg.array[order(reg.array[,'AIC','cc']),,]
plot(reg.array[1:dim(reg.array)[1],'estimate','cc'], pch=16, col='#e41a1c', bty='l', ylim=c(min(reg.array[,c("estimate", "2.5 %","97.5 %"),]),6), ylab='Coefficient', xaxt='n', xlab='')
points(((1:dim(reg.array)[1]+0.1)),reg.array[,'estimate','mlst'], pch=17, col='#377eb8')
points(((1:dim(reg.array)[1]+0.2)),reg.array[,'estimate','gpsc'], pch=18, col='#4daf4a')
points(((1:dim(reg.array)[1]+0.3)),reg.array[,'estimate','phylo300'], pch=18, col='#a65628')
points(((1:dim(reg.array)[1]+0.4)),reg.array[,'estimate','phylo200'], pch=18, col='#ff7f00')
points(((1:dim(reg.array)[1]+0.5)),reg.array[,'estimate','phylo100'], pch=18, col='#984ea3')

arrows(x0=1:dim(reg.array)[1],y0=reg.array[,"2.5 %",'cc'],y1=reg.array[,"97.5 %",'cc'], pch=16, col='#e41a1c', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.1)),y0=reg.array[,"2.5 %",'mlst'],y1=reg.array[,"97.5 %",'mlst'], pch=16, col='#377eb8', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.2)),y0=reg.array[,"2.5 %",'gpsc'],y1=reg.array[,"97.5 %",'gpsc'], pch=16, col='#4daf4a', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.3)),y0=reg.array[,"2.5 %",'phylo300'],y1=reg.array[,"97.5 %",'phylo300'], pch=16, col='#a65628', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.4)),y0=reg.array[,"2.5 %",'phylo200'],y1=reg.array[,"97.5 %",'phylo200'], pch=16, col='#ff7f00', length=0)
arrows(x0=((1:dim(reg.array)[1]+0.5)),y0=reg.array[,"2.5 %",'phylo100'],y1=reg.array[,"97.5 %",'phylo100'], pch=16, col='#984ea3', length=0)

abline(h=0, lty=2, col='gray')
text(1:12, y=-2.5, dimnames(reg.array)[[1]], pos=4, offset=0, xpd=NA, col='gray', cex=0.75)
legend("topright", legend=c("CC", "MLST", "GPSC", 'Phylo300', 'Phylo200', 'Phylo100'),       col=c("#e41a1c", "#377eb8", "#4daf4a",'#a65628','#ff7f00',  '#984ea3'), pch=c(16,17,18,18,18,18), box.lty=0)

```

## Simply quantify which serotypes are more/less diverse than expected based on their carriage
```{r}


#Function to evauate correlations
#diversity.re.func<-function(ds){
  ds= GPSC.sdi
 names(ds) <- c('st','sdi','N')
  st.cc.diversity<-merge(ds, carr1, by.x='st', by.y='ST', all=T)
  st.cc.diversity<-merge(st.cc.diversity, ps1, by.x='st', by.y='Serotype', all=T)
  st.cc.diversity$SLEEMANCARRN[is.na(st.cc.diversity$SLEEMANCARRN)]<-0
  st.cc.diversity$NORWAYCARR_PRE[is.na(st.cc.diversity$NORWAYCARR_PRE)]<-0
 # st.cc.diversity<-st.cc.diversity[!is.na(st.cc.diversity$st.cc.diversity),]
  #st.cc.diversity<-st.cc.diversity[!is.na(st.cc.diversity$Glc),]
  st.cc.diversity$sdi[st.cc.diversity$sdi==0]<-0.01
  st.cc.diversity$sdi[st.cc.diversity$sdi==1]<-0.99
  st.cc.diversity$logit.diversity<- log(st.cc.diversity$sdi/(1-st.cc.diversity$sdi))
 # plot(logit.diversity, log(st.cc.diversity$st.cc.sum))
  

p2<-ggplot(data=st.cc.diversity, aes(x=sqrt(SLEEMANCARRN) , y=logit.diversity)) +
  geom_text_repel(size=3,aes(label=st) , colour='gray70')+
    geom_point()+
  theme(panel.grid.major = element_blank(),legend.position = "none", panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  xlab("Sleeman Carr (sqrt")+
  ylab('Serotype Diversity (GPSC)')
p2

```