library(cmapR)
library(vsn)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
#The GCT file was parsed using the parse function from cmapR package 
paad.gct <- parse_gctx('PAAD.gct')
## parsing as GCT v1.3
## PAAD.gct 18465 rows, 183 cols, 0 row descriptors, 124 col descriptors
#Object containing the normalised counts data for PAAD
es <- paad.gct@mat

#Evaluating the trend of the mean and variance of the normalised expression data
meanSdPlot(es, ranks = F)

#Removing the genes containing NaNs as they will add noise to the further analysis
es.complete <- es[complete.cases(es),]

print('After removing the expressions with low counts and NaNs, the plot shows removal of counts with low mean and higher variance as those can cause unwanted bias in the analysis' )
## [1] "After removing the expressions with low counts and NaNs, the plot shows removal of counts with low mean and higher variance as those can cause unwanted bias in the analysis"
meanSdPlot(es.complete, ranks = F)

print('Preparing the dataset for PCA')
## [1] "Preparing the dataset for PCA"
print('We want our plots to be readable so we will add labels and color coding for the various histological subtypes')
## [1] "We want our plots to be readable so we will add labels and color coding for the various histological subtypes"
#Preparing data for PCA
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == '82463 neuroendocrine carcinoma nos'] <- 'N'
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'adenocarcinoma- nos'] <- 'A' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'adenocarcinoma, nos'] <- 'A' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'ductal and micropapillary'] <- 'DMP' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'intraductal tubulopapillary neoplasm'] <- 'ITN' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'invasive adenocarcinoma'] <- 'IA' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'invasive, well-differentiated'] <- 'IWD' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'moderately differentiated ductal adenocarcinoma 60% + neuroendocrine 40%'] <- 'MD'
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'neuroendocrine'] <- 'N'
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'neuroendocrine carcinoma'] <- 'N'
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'neuroendocrine carcinoma nos'] <- 'N' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'not specified'] <- 'X' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'poorly differentiated adenocarcinoma'] <- 'X' 
paad.gct@cdesc$label[paad.gct@cdesc$histological_type_other == 'poorly differentiated pancreatic adenocarcinoma'] <- 'X' 

#Color Coding
paad.gct@cdesc$color[paad.gct@cdesc$label == 'A'] <- 'blue'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'N'] <- 'red'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'DMP'] <- 'green'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'ITN'] <- 'yellow'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'IA'] <- 'cyan'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'IWD'] <- 'brown'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'MD'] <- 'gray'
paad.gct@cdesc$color[paad.gct@cdesc$label == 'X'] <- 'black'

#Frequency of histological subtypes in the dataset
group <- data.frame(cbind(paad.gct@cdesc$histological_type_other,
                          paad.gct@cdesc$label,
                          paad.gct@cdesc$color))
group <- na.omit(group)
group <- unique(group)
colnames(group) <- c('Histological Type (Other)','Label','Color')

print('-----------------PRINICIPAL COMPONENT ANALYSIS----------------')
## [1] "-----------------PRINICIPAL COMPONENT ANALYSIS----------------"
library(stats)
#PCA 
pca.paad <- prcomp(t(es.complete))

## Get the standard deviation and variance per principal component
sd.per.pc <- pca.paad$sdev
var.per.pc <- sd.per.pc^2

## Display the percentage of total variance explained by each 
sd.per.pc.percent <- sd.per.pc/sum(sd.per.pc)
var.per.pc.percent <- var.per.pc/sum(var.per.pc)

print('Prinicpal Component 1 has a 17% variance meaning that variables along this component
      explains 17% variance amongst the various histological types,
      whereas Prinicipal Component 2 has a variance of ~15% which means that variables along this component which is
      orthogonal to PC1 explains ~15% variation amongst the histological subtypes')
## [1] "Prinicpal Component 1 has a 17% variance meaning that variables along this component\n      explains 17% variance amongst the various histological types,\n      whereas Prinicipal Component 2 has a variance of ~15% which means that variables along this component which is\n      orthogonal to PC1 explains ~15% variation amongst the histological subtypes"
barplot(var.per.pc.percent[1:10], main='PAAD, Percent of variance  per component', xlab='Component', ylab='Percent variance', col='#BBDDFF')

#Summary of the Histological Types and their corresonding labels
flextable::flextable(group)

Histological Type (Other)

Label

Color

invasive adenocarcinoma

IA

cyan

invasive, well-differentiated

IWD

brown

poorly differentiated adenocarcinoma

X

black

neuroendocrine

N

red

neuroendocrine carcinoma nos

N

red

82463 neuroendocrine carcinoma nos

N

red

neuroendocrine carcinoma

N

red

adenocarcinoma, nos

A

blue

poorly differentiated pancreatic adenocarcinoma

X

black

not specified

X

black

intraductal tubulopapillary neoplasm

ITN

yellow

ductal and micropapillary

DMP

green

adenocarcinoma- nos

A

blue

moderately differentiated ductal adenocarcinoma 60% + neuroendocrine 40%

MD

gray

## Plot components PC1 and PC2
plot(pca.paad$x[,1:2],
     type='n',
     panel.first=c(grid(col='black')),
     main=paste('PCA; PAAD; ',
                ncol(es.complete), 'samples *', nrow(es.complete), 'genes', sep=' '), 
     xlab=paste0('PC1 ',round(var.per.pc.percent[[1]]*100,2),'%'), 
     ylab=paste0('PC2 ',round(var.per.pc.percent[[2]]*100,2),'%'))
text(pca.paad$x[,1:2],labels=paad.gct@cdesc$label,col=paad.gct@cdesc$color,pch=0.3)

print('From the plot we can clearly see a seperation between Neuroendocrine samples (N) and the adenocarcinoma samples (A)')
## [1] "From the plot we can clearly see a seperation between Neuroendocrine samples (N) and the adenocarcinoma samples (A)"
#Removing the neuroendocrine tumor samples from the dataset
#Extract sample IDs containing Neuroendocrine samples

IDs <- rownames(paad.gct@cdesc)[paad.gct@cdesc$label == 'N']
IDs <- na.omit(IDs)

print('The IDs corresponding to the Neuroendocrine Tumors could not be removed')
## [1] "The IDs corresponding to the Neuroendocrine Tumors could not be removed"
print('CONSTRAINT: Unale to subset S4 Object: Restrictive binding of object in an S4 class')
## [1] "CONSTRAINT: Unale to subset S4 Object: Restrictive binding of object in an S4 class"
# GENE EXPRESSION DATA ANALYSIS USING TYPE1-IFNs
IFN <- read.table("D:/RProjects/PPAD/type1_IFN.txt", quote="\"", comment.char="")
IFN_genes <- as.character(IFN$V1)
#Plot of the gene expression for IFN genes
es.IFN <- es[IFN_genes,]

boxplot(t(es.IFN), xlab="", ylab="Normalised Counts",las=2)

#Running GSVA
library(GSVA)
#Evaluating whether a gene is highly or lowly expressed in sample each in the context
#of the sample population distribution
es.max <- gsva(es, as.list(IFN_genes), mx.diff=FALSE, verbose=FALSE, parallel.sz=1)
## Warning in .local(expr, gset.idx.list, ...): 4367 genes with constant
## expression values throuhgout the samples.
## Warning in .local(expr, gset.idx.list, ...): Since argument method!
## ="ssgsea", genes with constant expression values are discarded.
es.dif <- gsva(es, as.list(IFN_genes), mx.diff=TRUE, verbose=FALSE, parallel.sz=1)
## Warning in .local(expr, gset.idx.list, ...): 4367 genes with constant
## expression values throuhgout the samples.

## Warning in .local(expr, gset.idx.list, ...): Since argument method!
## ="ssgsea", genes with constant expression values are discarded.
par(mfrow=c(1,2), mar=c(4, 4, 4, 1))
plot(density(as.vector(es.max)), main="Maximum deviation from zero",
       xlab="GSVA score", lwd=2, las=1, xaxt="n", xlim=c(-0.75, 0.75), cex.axis=0.8)
axis(1, at=seq(-0.75, 0.75, by=0.25), labels=seq(-0.75, 0.75, by=0.25), cex.axis=0.8)
plot(density(as.vector(es.dif)), main="Difference between largest\npositive and negative deviations",
       xlab="GSVA score", lwd=2, las=1, xaxt="n", xlim=c(-0.75, 0.75), cex.axis=0.8)
axis(1, at=seq(-0.75, 0.75, by=0.25), labels=seq(-0.75, 0.75, by=0.25), cex.axis=0.8)