Load dependencies

Load the required libraries.

Preamble

The correct approach for statistical testing for changes in phosphorylation will depend on the details of the experimental design. Here, we have quantified phospho and unmodified peptides, so we can identify changes in phosphorylation which are not explained by changes in the unmodified protein, e.g changes in the proportion of the protein that is phosphorylated.

To acheive this, we will jointly model the phospho and unmodified peptides which overlap each phosphosite, using a linear model with an interaction term.

Input data

We start by reading in the MSnSets created in the previous notebook QC PSM-level quantification, filtering and summarisation to protein-level abundance

total <- readRDS(here('results/total.rds'))
phospho_sites <- readRDS(here('results/phospho_sites.rds'))

Below, we identify the total peptides that overlap each phosphosite. Note that each phosphosite may intersect multiple total peptides (due to missed cleavages) and each total peptide may intersect multiple phosphosites (due to multiple phosphosites per peptide).

phospho_f <- phospho_sites %>% fData() %>% tibble::rownames_to_column('ID')
total_f <- total %>% fData() %>% tibble::rownames_to_column('ID')

phospho_peptide_to_total_peptides <- vector('list', length=length(phospho_f$ID))
names(phospho_peptide_to_total_peptides) <- phospho_f$ID
n <- 0
for(phospho_id in phospho_f$ID){
  row <- phospho_f %>% filter(ID==phospho_id)
  
  n <- n + 1
  ptm_positions <- as.numeric(strsplit(row$ptm_position, split='\\|')[[1]])
  
  total_peptide_ids <- total_f %>%
    filter(Master.Protein.Accessions %in% row$Master.Protein.Accessions,
           peptide_start<=min(ptm_positions),
           peptide_end>=max(ptm_positions)) %>%
    pull(ID)  
  
  phospho_peptide_to_total_peptides[[row$ID]] <- total_peptide_ids
}

Now, how many overlapping features. Note that for the phospho and total to be considered intersecting, there must be at least one total peptide which overlaps the phosphosite. Out of the 666 phosphosites, only 107 have at least one overlapping total peptide.

print(length(phospho_f$ID))
## [1] 666
print(table(sapply(phospho_peptide_to_total_peptides, function(x) length(x)>0)))
## 
## FALSE  TRUE 
##   559   107
print(table(sapply(phospho_peptide_to_total_peptides, length)))
## 
##   0   1   2   3 
## 559  95  11   1

limma requires a single row per feature with a columns per sample. Here, the phospho and total are separate samples so we need to combine them into a single row.

combined_exprs <- phospho_peptide_to_total_peptides %>% names() %>% lapply(function(x){
  total_peptides <- phospho_peptide_to_total_peptides[[x]]
  
  if(length(total_peptides)>0) {
    exprs_values <- c(exprs(phospho_sites[x,]),
                      colSums(exprs(total[total_peptides,]), na.rm=TRUE))
    exprs_values <- unname(exprs_values)
    return(exprs_values)
  } else { return(NULL) }
})
names(combined_exprs) <- names(phospho_peptide_to_total_peptides)

Create the completed matrix of phospho and total.

combined_exprs <- combined_exprs[sapply(combined_exprs, function(x) !is.null(x))]
all_combined_exprs <- bind_rows(combined_exprs) %>% t() %>% as.matrix

rownames(all_combined_exprs) <- names(combined_exprs)

colnames(all_combined_exprs) <- c(paste0(colnames(phospho_sites), '_phospho'),
                                  paste0(colnames(total), '_total'))

Inspect the combined quantification matrix.

head(all_combined_exprs) #
##                    126_phospho 127N_phospho 127C_phospho 128N_phospho
## O00499_296|298|303       481.9        476.6        474.9        431.4
## O14639_452|455           246.2        231.8        246.9        236.5
## O60841_113                86.1         84.5         89.5         72.5
## O60841_164               320.5        308.4        291.1        275.6
## O60841_214               718.1        661.7        708.4        611.4
## O75554_258|262           490.5        475.2        497.3        414.3
##                    128C_phospho 129N_phospho 129C_phospho 130N_phospho
## O00499_296|298|303        338.5        373.1        314.4        342.0
## O14639_452|455            175.9        184.5        167.9        181.8
## O60841_113                 70.5         68.8         47.7         76.9
## O60841_164                229.5        249.1        239.9        246.6
## O60841_214                473.7        537.8        456.4        526.5
## O75554_258|262            327.8        350.0        335.4        362.6
##                    130C_phospho 131_phospho 126_total 127N_total 127C_total
## O00499_296|298|303        333.7       335.6      59.2       53.2       65.7
## O14639_452|455            173.5       170.9     108.2      113.9      111.5
## O60841_113                 70.0        66.6      91.4       79.0       95.6
## O60841_164                236.5       277.4       7.6       10.4       11.5
## O60841_214                518.8       504.3      70.2       65.3       77.1
## O75554_258|262            324.9       355.0      80.2       76.0       87.4
##                    128N_total 128C_total 129N_total 129C_total 130N_total
## O00499_296|298|303       51.6       58.4       68.3       50.5       45.0
## O14639_452|455          106.3      106.0      122.4       86.0       79.0
## O60841_113              100.0       94.1       91.2       89.9       84.0
## O60841_164               11.5       11.7        8.8        9.7        8.2
## O60841_214               62.8       68.6       67.3       60.1       53.6
## O75554_258|262           65.7       71.0       88.9       78.3       76.5
##                    130C_total 131_total
## O00499_296|298|303       54.4      47.1
## O14639_452|455           91.8      89.0
## O60841_113               75.7      81.4
## O60841_164                7.4      12.7
## O60841_214               63.7      56.5
## O75554_258|262           81.6     102.5

We also need to create feature data and phenotype data (experimental information) for our combined quantification matrix. From these, we can then create an MSnSet to hold all the combined quantification data.

all_combined_fdata <- fData(phospho_sites[rownames(all_combined_exprs),])[
  ,c('Master.Protein.Accessions', 'ptm_position')]
all_combined_fdata$n_total <- sapply(rownames(all_combined_exprs),
                                     function(x) length(phospho_peptide_to_total_peptides[[x]]))

all_combined_pdata <- rbind((pData(phospho_sites) %>% mutate(type='phospho')),
                            (pData(total) %>% mutate(type='total')))
rownames(all_combined_pdata) <- colnames(all_combined_exprs)
all_combined_res <- MSnSet(as.matrix.data.frame(all_combined_exprs),
                           all_combined_fdata,
                           all_combined_pdata)

head(fData(all_combined_res))
##                    Master.Protein.Accessions ptm_position n_total
## O00499_296|298|303                    O00499  296|298|303       1
## O14639_452|455                        O14639      452|455       1
## O60841_113                            O60841          113       1
## O60841_164                            O60841          164       1
## O60841_214                            O60841          214       1
## O75554_258|262                        O75554      258|262       1
pData(all_combined_res)
##              spike condition    type
## 126_phospho     x1         1 phospho
## 127N_phospho    x1         1 phospho
## 127C_phospho    x1         1 phospho
## 128N_phospho    x1         1 phospho
## 128C_phospho    x6         2 phospho
## 129N_phospho    x6         2 phospho
## 129C_phospho    x6         2 phospho
## 130N_phospho    x6         3 phospho
## 130C_phospho    x6         3 phospho
## 131_phospho     x6         3 phospho
## 126_total       x1         1   total
## 127N_total      x1         1   total
## 127C_total      x1         1   total
## 128N_total      x1         1   total
## 128C_total      x2         2   total
## 129N_total      x2         2   total
## 129C_total      x2         2   total
## 130N_total      x6         3   total
## 130C_total      x6         3   total
## 131_total       x6         3   total

Below, we subset to the first 2 ‘conditions’, where condition 1 = tags 1-4 (x1 phospho; x1 total) and condition 2 = tags 5-7 (x6 phospho; x2 total). The ground truth for the difference between condition 2 and condition 1 is therefore a 3-fold increase in phosphorylation for the yeast proteins and reduced phosphorylation for the human proteins. For the human proteins, given the amounts of spike in yeast and balancing human proteins to make up the labelled material, we expect a 30% drop in phosphorylation.

pairwise_comparison_combined_res <- all_combined_res[,pData(all_combined_res)$condition %in% 1:2]
pData(pairwise_comparison_combined_res)
##              spike condition    type
## 126_phospho     x1         1 phospho
## 127N_phospho    x1         1 phospho
## 127C_phospho    x1         1 phospho
## 128N_phospho    x1         1 phospho
## 128C_phospho    x6         2 phospho
## 129N_phospho    x6         2 phospho
## 129C_phospho    x6         2 phospho
## 126_total       x1         1   total
## 127N_total      x1         1   total
## 127C_total      x1         1   total
## 128N_total      x1         1   total
## 128C_total      x2         2   total
## 129N_total      x2         2   total
## 129C_total      x2         2   total

Run limma

Below, we perform the limma analysis. First though, some clarification on the model we’re using.

We have two experimental variables:

type <- factor(pData(pairwise_comparison_combined_res)$type, level=c('total', 'phospho'))
condition <- factor(pData(pairwise_comparison_combined_res)$condition, levels=1:2)
print(paste(type, condition))
##  [1] "phospho 1" "phospho 1" "phospho 1" "phospho 1" "phospho 2" "phospho 2"
##  [7] "phospho 2" "total 1"   "total 1"   "total 1"   "total 1"   "total 2"  
## [13] "total 2"   "total 2"

A simple additive model would estimate independent effects for the type and condition variables

# model without interaction term
print(model.matrix(~type+condition))
##    (Intercept) typephospho condition2
## 1            1           1          0
## 2            1           1          0
## 3            1           1          0
## 4            1           1          0
## 5            1           1          1
## 6            1           1          1
## 7            1           1          1
## 8            1           0          0
## 9            1           0          0
## 10           1           0          0
## 11           1           0          0
## 12           1           0          1
## 13           1           0          1
## 14           1           0          1
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
## 
## attr(,"contrasts")$condition
## [1] "contr.treatment"

If we want to explore a possible combinatorial effect, we need to also include a type:condition interaction term. In our case, this captures the difference in abundance for phosphopeptides between the conditions, which is specific for phosphopeptides and not seen in the unmodified peptides.

# model with an interaction term
print(model.matrix(~type+condition+type:condition))
##    (Intercept) typephospho condition2 typephospho:condition2
## 1            1           1          0                      0
## 2            1           1          0                      0
## 3            1           1          0                      0
## 4            1           1          0                      0
## 5            1           1          1                      1
## 6            1           1          1                      1
## 7            1           1          1                      1
## 8            1           0          0                      0
## 9            1           0          0                      0
## 10           1           0          0                      0
## 11           1           0          0                      0
## 12           1           0          1                      0
## 13           1           0          1                      0
## 14           1           0          1                      0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$type
## [1] "contr.treatment"
## 
## attr(,"contrasts")$condition
## [1] "contr.treatment"

Below, we run the linear modeling. For more details about limma, see the Differential abundance testing for TMT proteomics notebook.

dat <- exprs(pairwise_comparison_combined_res) %>% log(base=2)
study.design <- model.matrix(~type*condition)

fit <- lmFit(dat, study.design)
fit <- eBayes(fit, trend=TRUE)

limma.results <- topTable(fit, coef = colnames(fit$coefficients)[4], n = Inf, confint=TRUE)
limma.results$sigma <- fit$sigma

We would like to add information about the species to make sure the fold-changes are in the expected direction. We’ll use the uniprotREST package to query

limma.results$protein <- rownames(limma.results) %>%
  strsplit(split='_') %>%
  sapply('[[', 1) 

species_res <-  uniprot_map(
  ids = unique(limma.results$protein),
  from = "UniProtKB_AC-ID",
  to = "UniProtKB",
  fields = "organism_name") %>%
  mutate(species=recode(Organism,
                        "Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)"="S.cerevisiae",
                        "Homo sapiens (Human)"="H.sapiens"))
## Running job: e6f48376e0a73829d0b31e381f5d84a9191fa8da 
## Checking job status...
## Job complete!
##  Downloading: page 1 of 1

Below, we plot the limma results using a volcano plot, with two panels, one for each species.

limma.results %>%
  merge(species_res, by.x='protein', by.y='From') %>%
  ggplot(aes(logFC, -log10(P.Value), fill=adj.P.Val<0.01)) +
  geom_point(pch=21, size=2, colour='grey70') +
  facet_wrap(~species) +
  theme_camprot(base_size=15, base_family='sans', border=FALSE) +
  theme(strip.background=element_blank()) +
  xlab('Log2 fold change') +
  ylab('-log10(p-value)') +
  scale_fill_manual(values=c('grey90', get_cat_palette(2)[2]), name='FDR < 1%')

The direction of change is as expected (reduced for human, increased for yeast), but only a subset of the fold-changes reach a 1% FDR threshold for significance. The fold-changes are appoximately what we would expect too: human phosphosites should be reduced by 30% = -0.51 on a log2 scale and the yeast phosphosites should be increased by 3-fold = 1.58 on a log2 scale.