Load the required libraries.
The correct approach for statistical testing for changes in phosphorylation will depend on the details of the experimental design. Here, we have quantified phospho and unmodified peptides, so we can identify changes in phosphorylation which are not explained by changes in the unmodified protein, e.g changes in the proportion of the protein that is phosphorylated.
To acheive this, we will jointly model the phospho and unmodified peptides which overlap each phosphosite, using a linear model with an interaction term.
We start by reading in the MSnSets created in the previous notebook QC PSM-level quantification, filtering and summarisation to protein-level abundance
total <- readRDS(here('results/total.rds'))
phospho_sites <- readRDS(here('results/phospho_sites.rds'))
Below, we identify the total peptides that overlap each phosphosite. Note that each phosphosite may intersect multiple total peptides (due to missed cleavages) and each total peptide may intersect multiple phosphosites (due to multiple phosphosites per peptide).
phospho_f <- phospho_sites %>% fData() %>% tibble::rownames_to_column('ID')
total_f <- total %>% fData() %>% tibble::rownames_to_column('ID')
phospho_peptide_to_total_peptides <- vector('list', length=length(phospho_f$ID))
names(phospho_peptide_to_total_peptides) <- phospho_f$ID
n <- 0
for(phospho_id in phospho_f$ID){
row <- phospho_f %>% filter(ID==phospho_id)
n <- n + 1
ptm_positions <- as.numeric(strsplit(row$ptm_position, split='\\|')[[1]])
total_peptide_ids <- total_f %>%
filter(Master.Protein.Accessions %in% row$Master.Protein.Accessions,
peptide_start<=min(ptm_positions),
peptide_end>=max(ptm_positions)) %>%
pull(ID)
phospho_peptide_to_total_peptides[[row$ID]] <- total_peptide_ids
}
Now, how many overlapping features. Note that for the phospho and total to be considered intersecting, there must be at least one total peptide which overlaps the phosphosite. Out of the 666 phosphosites, only 107 have at least one overlapping total peptide.
print(length(phospho_f$ID))
## [1] 666
print(table(sapply(phospho_peptide_to_total_peptides, function(x) length(x)>0)))
##
## FALSE TRUE
## 559 107
print(table(sapply(phospho_peptide_to_total_peptides, length)))
##
## 0 1 2 3
## 559 95 11 1
limma requires a single row per feature with a columns per sample. Here, the phospho and total are separate samples so we need to combine them into a single row.
combined_exprs <- phospho_peptide_to_total_peptides %>% names() %>% lapply(function(x){
total_peptides <- phospho_peptide_to_total_peptides[[x]]
if(length(total_peptides)>0) {
exprs_values <- c(exprs(phospho_sites[x,]),
colSums(exprs(total[total_peptides,]), na.rm=TRUE))
exprs_values <- unname(exprs_values)
return(exprs_values)
} else { return(NULL) }
})
names(combined_exprs) <- names(phospho_peptide_to_total_peptides)
Create the completed matrix of phospho and total.
combined_exprs <- combined_exprs[sapply(combined_exprs, function(x) !is.null(x))]
all_combined_exprs <- bind_rows(combined_exprs) %>% t() %>% as.matrix
rownames(all_combined_exprs) <- names(combined_exprs)
colnames(all_combined_exprs) <- c(paste0(colnames(phospho_sites), '_phospho'),
paste0(colnames(total), '_total'))
Inspect the combined quantification matrix.
head(all_combined_exprs) #
## 126_phospho 127N_phospho 127C_phospho 128N_phospho
## O00499_296|298|303 481.9 476.6 474.9 431.4
## O14639_452|455 246.2 231.8 246.9 236.5
## O60841_113 86.1 84.5 89.5 72.5
## O60841_164 320.5 308.4 291.1 275.6
## O60841_214 718.1 661.7 708.4 611.4
## O75554_258|262 490.5 475.2 497.3 414.3
## 128C_phospho 129N_phospho 129C_phospho 130N_phospho
## O00499_296|298|303 338.5 373.1 314.4 342.0
## O14639_452|455 175.9 184.5 167.9 181.8
## O60841_113 70.5 68.8 47.7 76.9
## O60841_164 229.5 249.1 239.9 246.6
## O60841_214 473.7 537.8 456.4 526.5
## O75554_258|262 327.8 350.0 335.4 362.6
## 130C_phospho 131_phospho 126_total 127N_total 127C_total
## O00499_296|298|303 333.7 335.6 59.2 53.2 65.7
## O14639_452|455 173.5 170.9 108.2 113.9 111.5
## O60841_113 70.0 66.6 91.4 79.0 95.6
## O60841_164 236.5 277.4 7.6 10.4 11.5
## O60841_214 518.8 504.3 70.2 65.3 77.1
## O75554_258|262 324.9 355.0 80.2 76.0 87.4
## 128N_total 128C_total 129N_total 129C_total 130N_total
## O00499_296|298|303 51.6 58.4 68.3 50.5 45.0
## O14639_452|455 106.3 106.0 122.4 86.0 79.0
## O60841_113 100.0 94.1 91.2 89.9 84.0
## O60841_164 11.5 11.7 8.8 9.7 8.2
## O60841_214 62.8 68.6 67.3 60.1 53.6
## O75554_258|262 65.7 71.0 88.9 78.3 76.5
## 130C_total 131_total
## O00499_296|298|303 54.4 47.1
## O14639_452|455 91.8 89.0
## O60841_113 75.7 81.4
## O60841_164 7.4 12.7
## O60841_214 63.7 56.5
## O75554_258|262 81.6 102.5
We also need to create feature data and phenotype data (experimental information) for our combined quantification matrix. From these, we can then create an MSnSet to hold all the combined quantification data.
all_combined_fdata <- fData(phospho_sites[rownames(all_combined_exprs),])[
,c('Master.Protein.Accessions', 'ptm_position')]
all_combined_fdata$n_total <- sapply(rownames(all_combined_exprs),
function(x) length(phospho_peptide_to_total_peptides[[x]]))
all_combined_pdata <- rbind((pData(phospho_sites) %>% mutate(type='phospho')),
(pData(total) %>% mutate(type='total')))
rownames(all_combined_pdata) <- colnames(all_combined_exprs)
all_combined_res <- MSnSet(as.matrix.data.frame(all_combined_exprs),
all_combined_fdata,
all_combined_pdata)
head(fData(all_combined_res))
## Master.Protein.Accessions ptm_position n_total
## O00499_296|298|303 O00499 296|298|303 1
## O14639_452|455 O14639 452|455 1
## O60841_113 O60841 113 1
## O60841_164 O60841 164 1
## O60841_214 O60841 214 1
## O75554_258|262 O75554 258|262 1
pData(all_combined_res)
## spike condition type
## 126_phospho x1 1 phospho
## 127N_phospho x1 1 phospho
## 127C_phospho x1 1 phospho
## 128N_phospho x1 1 phospho
## 128C_phospho x6 2 phospho
## 129N_phospho x6 2 phospho
## 129C_phospho x6 2 phospho
## 130N_phospho x6 3 phospho
## 130C_phospho x6 3 phospho
## 131_phospho x6 3 phospho
## 126_total x1 1 total
## 127N_total x1 1 total
## 127C_total x1 1 total
## 128N_total x1 1 total
## 128C_total x2 2 total
## 129N_total x2 2 total
## 129C_total x2 2 total
## 130N_total x6 3 total
## 130C_total x6 3 total
## 131_total x6 3 total
Below, we subset to the first 2 ‘conditions’, where condition 1 = tags 1-4 (x1 phospho; x1 total) and condition 2 = tags 5-7 (x6 phospho; x2 total). The ground truth for the difference between condition 2 and condition 1 is therefore a 3-fold increase in phosphorylation for the yeast proteins and reduced phosphorylation for the human proteins. For the human proteins, given the amounts of spike in yeast and balancing human proteins to make up the labelled material, we expect a 30% drop in phosphorylation.
pairwise_comparison_combined_res <- all_combined_res[,pData(all_combined_res)$condition %in% 1:2]
pData(pairwise_comparison_combined_res)
## spike condition type
## 126_phospho x1 1 phospho
## 127N_phospho x1 1 phospho
## 127C_phospho x1 1 phospho
## 128N_phospho x1 1 phospho
## 128C_phospho x6 2 phospho
## 129N_phospho x6 2 phospho
## 129C_phospho x6 2 phospho
## 126_total x1 1 total
## 127N_total x1 1 total
## 127C_total x1 1 total
## 128N_total x1 1 total
## 128C_total x2 2 total
## 129N_total x2 2 total
## 129C_total x2 2 total
Below, we perform the limma analysis. First though, some clarification on the model we’re using.
We have two experimental variables:
type: phospho or total
condition: 1x or 2x spike in
type <- factor(pData(pairwise_comparison_combined_res)$type, level=c('total', 'phospho'))
condition <- factor(pData(pairwise_comparison_combined_res)$condition, levels=1:2)
print(paste(type, condition))
## [1] "phospho 1" "phospho 1" "phospho 1" "phospho 1" "phospho 2" "phospho 2"
## [7] "phospho 2" "total 1" "total 1" "total 1" "total 1" "total 2"
## [13] "total 2" "total 2"
A simple additive model would estimate independent effects for the
type
and condition
variables
# model without interaction term
print(model.matrix(~type+condition))
## (Intercept) typephospho condition2
## 1 1 1 0
## 2 1 1 0
## 3 1 1 0
## 4 1 1 0
## 5 1 1 1
## 6 1 1 1
## 7 1 1 1
## 8 1 0 0
## 9 1 0 0
## 10 1 0 0
## 11 1 0 0
## 12 1 0 1
## 13 1 0 1
## 14 1 0 1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
##
## attr(,"contrasts")$condition
## [1] "contr.treatment"
If we want to explore a possible combinatorial effect, we need to
also include a type:condition
interaction term. In our
case, this captures the difference in abundance for phosphopeptides
between the conditions, which is specific for phosphopeptides and not
seen in the unmodified peptides.
# model with an interaction term
print(model.matrix(~type+condition+type:condition))
## (Intercept) typephospho condition2 typephospho:condition2
## 1 1 1 0 0
## 2 1 1 0 0
## 3 1 1 0 0
## 4 1 1 0 0
## 5 1 1 1 1
## 6 1 1 1 1
## 7 1 1 1 1
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 0
## 11 1 0 0 0
## 12 1 0 1 0
## 13 1 0 1 0
## 14 1 0 1 0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
##
## attr(,"contrasts")$condition
## [1] "contr.treatment"
Below, we run the linear modeling. For more details about
limma
, see the Differential
abundance testing for TMT proteomics notebook.
dat <- exprs(pairwise_comparison_combined_res) %>% log(base=2)
study.design <- model.matrix(~type*condition)
fit <- lmFit(dat, study.design)
fit <- eBayes(fit, trend=TRUE)
limma.results <- topTable(fit, coef = colnames(fit$coefficients)[4], n = Inf, confint=TRUE)
limma.results$sigma <- fit$sigma
We would like to add information about the species to make sure the
fold-changes are in the expected direction. We’ll use the
uniprotREST
package to query
limma.results$protein <- rownames(limma.results) %>%
strsplit(split='_') %>%
sapply('[[', 1)
species_res <- uniprot_map(
ids = unique(limma.results$protein),
from = "UniProtKB_AC-ID",
to = "UniProtKB",
fields = "organism_name") %>%
mutate(species=recode(Organism,
"Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)"="S.cerevisiae",
"Homo sapiens (Human)"="H.sapiens"))
## Running job: e6f48376e0a73829d0b31e381f5d84a9191fa8da
## Checking job status...
## Job complete!
## Downloading: page 1 of 1
Below, we plot the limma results using a volcano plot, with two panels, one for each species.
limma.results %>%
merge(species_res, by.x='protein', by.y='From') %>%
ggplot(aes(logFC, -log10(P.Value), fill=adj.P.Val<0.01)) +
geom_point(pch=21, size=2, colour='grey70') +
facet_wrap(~species) +
theme_camprot(base_size=15, base_family='sans', border=FALSE) +
theme(strip.background=element_blank()) +
xlab('Log2 fold change') +
ylab('-log10(p-value)') +
scale_fill_manual(values=c('grey90', get_cat_palette(2)[2]), name='FDR < 1%')
The direction of change is as expected (reduced for human, increased for yeast), but only a subset of the fold-changes reach a 1% FDR threshold for significance. The fold-changes are appoximately what we would expect too: human phosphosites should be reduced by 30% = -0.51 on a log2 scale and the yeast phosphosites should be increased by 3-fold = 1.58 on a log2 scale.