library(rtracklayer)
esc_K27ac_bed <- import("HUES64_E016/E016-H3K27ac.narrowPeak.gz")
mesoderm_K27ac_bed <- import("meso_HUES64_E013/E013-H3K27ac.narrowPeak.gz")
#filter by top 10%
top10_score_esc <- quantile(mcols(esc_K27ac_bed)$score, 0.9)
esc_K27ac_bed_filter <- esc_K27ac_bed[esc_K27ac_bed$score > top10_score_esc]
top10_score_meso <- quantile(mcols(mesoderm_K27ac_bed)$score, 0.9)
mesoderm_K27ac_bed_filter <- mesoderm_K27ac_bed[mesoderm_K27ac_bed$score > top10_score_meso]
K27ac_list <- GRangesList(esc_K27ac_bed_filter, mesoderm_K27ac_bed_filter)
K27ac_list_unlist <- unlist(K27ac_list)
K27ac_list_reduce <- reduce(K27ac_list_unlist)
K27ac_list_unlist
## GRanges object with 24996 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr2 70311400-70317932 * | Rank_1 3420
## [2] chr1 156715984-156717022 * | Rank_2 3270
## [3] chr1 167188334-167189965 * | Rank_3 3128
## [4] chr1 153606439-153607319 * | Rank_4 3120
## [5] chr17 47071810-47077289 * | Rank_5 3007
## ... ... ... ... . ... ...
## [24992] chr12 25996117-25997192 * | Rank_13921 361
## [24993] chr4 105415664-105416898 * | Rank_13922 361
## [24994] chr4 58275980-58277464 * | Rank_13923 361
## [24995] chr8 120586061-120588309 * | Rank_13924 361
## [24996] chr8 19674901-19675905 * | Rank_13925 361
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <integer>
## [1] 57.06192 342.00323 332.51266 2125
## [2] 55.11938 327.03815 318.94553 765
## [3] 47.80416 312.89105 305.57074 1299
## [4] 46.54394 312.0542 304.77844 238
## [5] 56.15541 300.79483 293.95358 1427
## ... ... ... ... ...
## [24992] 11.98042 36.11326 33.49098 676
## [24993] 11.98042 36.11326 33.49098 813
## [24994] 11.98042 36.11326 33.49098 1012
## [24995] 11.98042 36.11326 33.49098 252
## [24996] 11.98042 36.11326 33.49098 213
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
set.seed(0)
K27ac_list_reduce_sample <- K27ac_list_reduce[sample(seq_along(K27ac_list_reduce), 1000)]
export.bed(K27ac_list_reduce_sample, "K27ac_top10_HUES64.bed")
signalFiles_deeptools_input <- paste(signalFiles, collapse = " ")
bedFiles_deeptools_input <- paste(testRanges, collapse = " ")
system(paste("computeMatrix scale-regions -R",
bedFiles_deeptools_input,
"-S",
signalFiles_deeptools_input,
"-o ./compute_matrix_HUES64",
sep = " "))
library(profileplyr)
library(grid)
esc_files <- list.files("HUES64_E016/", full.names = TRUE)
meso_files <- list.files("meso_HUES64_E013/", full.names = TRUE)
esc_bigwigs <- esc_files[grepl("bigwig", esc_files)]
meso_bigwigs <- meso_files[grepl("bigwig", meso_files)]
signalFiles <- c(esc_bigwigs, meso_bigwigs)
# chunk not evaluated
chipProfile <- BamBigwig_to_chipProfile(signalFiles = signalFiles,
testRanges = testRanges,
format = "bigwig",
style = "percentOfRegion",
nOfWindows = 50,
distanceAround = 100)
# Convert this output to profileplyr object
proplyr_object <- as_profileplyr(chipProfile)
## class: profileplyr
## dim: 1000 150
## metadata(0):
## assays(10): E016_WGBS_FractionalMethylation.bigwig
## E016-H3K27ac.pval.signal.bigwig ... E013-H3K4me1.pval.signal.bigwig
## E013-H3K4me3.pval.signal.bigwig
## rownames(1000): giID1 giID10 ... giID998 giID999
## rowData names(5): name score sgGroup giID names
## colnames(150): Start-1 Start-2 ... End+49 End+50
## colData names(0):
## GRanges object with 2 ranges and 5 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## giID1 chr21 34696445-34697205 + | <NA> 0
## giID10 chr21 36207782-36209962 + | <NA> 0
## sgGroup giID names
## <factor> <factor> <factor>
## giID1 K27ac_top10_HUES64.bed giID1 <NA>
## giID10 K27ac_top10_HUES64.bed giID10 <NA>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## DataFrame with 2 rows and 5 columns
## name score sgGroup giID names
## <character> <numeric> <factor> <factor> <factor>
## giID1 NA 0 K27ac_top10_HUES64.bed giID1 NA
## giID10 NA 0 K27ac_top10_HUES64.bed giID10 NA
## [1] 1000 150
## Start-1 Start-2 Start-3
## giID1 0.00000000 0.00000000 0.00000000
## giID10 0.08181818 0.08545455 0.13090909
## giID100 0.06964286 0.00000000 0.03035714
## DataFrame with 3 rows and 19 columns
## unscaled.5.prime body
## <numeric> <numeric>
## E016_WGBS_FractionalMethylation.bigwig 0 0
## E016-H3K27ac.pval.signal.bigwig 0 0
## E016-H3K27me3.pval.signal.bigwig 0 0
## sample_labels
## <factor>
## E016_WGBS_FractionalMethylation.bigwig E016_WGBS_FractionalMethylation.bigwig
## E016-H3K27ac.pval.signal.bigwig E016-H3K27ac.pval.signal.bigwig
## E016-H3K27me3.pval.signal.bigwig E016-H3K27me3.pval.signal.bigwig
## downstream unscaled.3.prime bin.size
## <numeric> <numeric> <numeric>
## E016_WGBS_FractionalMethylation.bigwig 75 0 1
## E016-H3K27ac.pval.signal.bigwig 75 0 1
## E016-H3K27me3.pval.signal.bigwig 75 0 1
## upstream ref.point verbose scale
## <numeric> <factor> <logical> <numeric>
## E016_WGBS_FractionalMethylation.bigwig 75 center FALSE 1
## E016-H3K27ac.pval.signal.bigwig 75 center FALSE 1
## E016-H3K27me3.pval.signal.bigwig 75 center FALSE 1
## skip.zeros nan.after.end sort.using
## <logical> <logical> <factor>
## E016_WGBS_FractionalMethylation.bigwig FALSE FALSE mean
## E016-H3K27ac.pval.signal.bigwig FALSE FALSE mean
## E016-H3K27me3.pval.signal.bigwig FALSE FALSE mean
## sort.regions proc.number bin.avg.type
## <factor> <numeric> <factor>
## E016_WGBS_FractionalMethylation.bigwig keep 11 mean
## E016-H3K27ac.pval.signal.bigwig keep 11 mean
## E016-H3K27me3.pval.signal.bigwig keep 11 mean
## missing.data.as.zero min.threshold
## <logical> <list>
## E016_WGBS_FractionalMethylation.bigwig FALSE NULL
## E016-H3K27ac.pval.signal.bigwig FALSE NULL
## E016-H3K27me3.pval.signal.bigwig FALSE NULL
## max.threshold
## <list>
## E016_WGBS_FractionalMethylation.bigwig NULL
## E016-H3K27ac.pval.signal.bigwig NULL
## E016-H3K27me3.pval.signal.bigwig NULL
row.names(sampleData(proplyr_object)) <- c("WGBS_esc","K27ac_esc", "K27me3_esc", "K4me1_esc", "K4me3_esc",
"WGBS_meso","K27ac_meso", "K27me3_meso", "K4me1_meso", "K4me3_meso")
sampleData(proplyr_object)$tissue <- c(rep("esc", 5), rep("meso", 5))
sampleData(proplyr_object)$chip <- c(rep(c("WGBS","K27ac","K27me3", "K4me1", "K4me3"), 2))
sampleData(proplyr_object)[1:3, 20:21]
## DataFrame with 3 rows and 2 columns
## tissue chip
## <character> <character>
## WGBS_esc esc WGBS
## K27ac_esc esc K27ac
## K27me3_esc esc K27me3
A profileplyr object can always be exported as a deepTools matrix using the ‘export_deepToolsMat’ function, and the resulting matrix after export object will maintain sample labels, sample subsetting, and range groupings. While here we haven’t done much to the object, this may be useful if you prefer deepTools heatmaps but want to use Bioconductor/R for manipulating the ranges
export_deepToolsMat(proplyr_object_subset,
con = "./exported_deepTools_matrix",
overwrite = TRUE)
system("plotHeatmap -m ./exported_deepTools_matrix.gz -o deepTools_heatmap_simple.pdf")
The profileplyr object can always be directly visualized in a range based heatmap within R using the ‘generateEnrichedHeatmap’ function
Change the colors of all heatmaps to the same scheme (white and black):
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = FALSE,
matrices_color = c("white", "black")
)
Note that if the ‘matrices_color’ argument is a list of vectors that is the same length as the number of heatmaps, then you can set each individual heatmap colors manually.
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = FALSE,
matrices_color = list(c("white", "black"),
c("white", "red"),
c("white", "blue"),
c("white", "green"),
c("white", "purple"),
c("white", "pink")
)
)
It is also important to consider the color scale of each heatmap, not just the color. By default the color is scaled based on the max signal of all heatmaps. This is controlled by the ‘all_color_scales_equal’ argument, which is by default TRUE. When set to FALSE, each heatmap will be scaled individually, and this will be reflected by the legends to the right of the figure.
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = FALSE,
matrices_color = list(c("white", "black"),
c("white", "red"),
c("white", "blue"),
c("white", "green"),
c("white", "purple"),
c("white", "pink")
),
all_color_scales_equal = FALSE
)
The ‘ylim’ argument will change the limits of the y-axis of the plot on top of each heatmap. If ‘ylim = “common_max”’, then the max will be set to slightly higher than the highest signal in all heatmaps. This is the default behavior.
This argument also works similar to the ‘matrices_color’ argument above, where a single character vector (e.g. c(0,10)) will set the limits to all plots to be the same, and a list of vectors that is the same length as the number of heatmaps will customize the limits of the y-axis for each individual heatmap.
Instead of manually setting the limit of each plot, it will likely be easier to set ‘ylim’ to be ‘NULL’. In this case the max for each heatmap will be inferred.
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = FALSE,
matrices_color = list(c("white", "black"),
c("white", "red"),
c("white", "blue"),
c("white", "green"),
c("white", "purple"),
c("white", "pink")
),
all_color_scales_equal = FALSE,
ylim = NULL
)
We will use the column titled “chip” that we created previously. Also note that the color scales are also set by the sample groups as well.
For the sake of space, and if you have different color scales for a figure with many heatmap panels, the legends can get a bit overwhelming and it might make sense to remove some that aren’t terribly informative. This can be done with the ‘show_heatmap_legend’ argument of the generateEnrichedHeatmap() function. Note that only the legends of the heatmaps representing the range matrices can be removed, while the legends for additional annotation columns will always remain. This argument takes a logical vector or equal length to the number of heatmaps, with each position corresponding the the heatmap in that position of the profileplyr object.
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = FALSE,
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6)
)
If the ‘output’ argument is set to ‘matrix’, then only a matrix will be returned with a single column for each sample containing the bins summarized as indicated with the ‘fun’ argument. The row names of this matrix is a unique identifier for each range containing the chromosome, start, end, and group.
proplyr_object_subset_sumMat <- profileplyr::summarize(proplyr_object_subset,
fun = rowMeans,
output = "matrix")
proplyr_object_subset_sumMat[1:3, ]
## K27ac_esc K4me1_esc
## chr21_34696445_34697205_K27ac_top10_HUES64.bed 4.0326237 0.327227
## chr21_36207782_36209962_K27ac_top10_HUES64.bed 0.9740337 2.296254
## chr7_150734773_150737527_K27ac_top10_HUES64.bed 0.5168534 0.308683
## K4me3_esc K27ac_meso
## chr21_34696445_34697205_K27ac_top10_HUES64.bed 104.73912667 15.517333
## chr21_36207782_36209962_K27ac_top10_HUES64.bed 0.35388878 8.081078
## chr7_150734773_150737527_K27ac_top10_HUES64.bed 0.09848487 11.528448
## K4me1_meso K4me3_meso
## chr21_34696445_34697205_K27ac_top10_HUES64.bed 0.2341562 122.2847307
## chr21_36207782_36209962_K27ac_top10_HUES64.bed 3.6760741 0.9602948
## chr7_150734773_150737527_K27ac_top10_HUES64.bed 2.6178709 0.3257963
This matrix can be used directly in other heatmap generating packages, including heatmap or pheatmap.
library(pheatmap)
pheatmap(proplyr_object_subset_sumMat,
scale = "row",
cluster_cols = FALSE,
show_rownames = FALSE)
If the ‘output’ argument is set to ‘long’, then the output will be a long data frame that can be used for plotting with ggplot. The grouping column of the range metadata as specified by ‘params(proplyrObject)$rowGroupsInUse’ will automatically be included in the data frame. If the other range metadata columns should be included in the data frame, then the ‘keep_all_mcols’ argument should be set to TRUE. Additionally, columns specifying the range, as well as the sample and the summarized signal that correspond to that range are included by default.
proplyr_object_subset_long <- profileplyr::summarize(proplyr_object_subset,
fun = rowMeans,
output = "long")
proplyr_object_subset_long[1:3, ]
## sgGroup combined_ranges Sample Signal
## 1 K27ac_top10_HUES64.bed chr21_34696445_34697205 K27ac_esc 4.0326237
## 2 K27ac_top10_HUES64.bed chr21_36207782_36209962 K27ac_esc 0.9740337
## 3 K27ac_top10_HUES64.bed chr7_150734773_150737527 K27ac_esc 0.5168534
Note: It is often helpful to log transform the signal to more clearly see trends in the signal that is quantified.
Lastly, if the ‘output’ argument is set to ‘object’, then a profileplyr object containing the summarized matrix will be returned. This will allow for further grouping or manipulation of the summarized ranges with other profileplyr functions, as opposed to using the binned ranges that are often used in later examples.
proplyr_object_subset_summ <- profileplyr::summarize(proplyr_object_subset,
fun = rowMeans,
output = "object")
## K27ac_esc K4me1_esc K4me3_esc K27ac_meso K4me1_meso K4me3_meso
## giID1 4.0326237 0.327227 104.73912667 15.517333 0.2341562 122.2847307
## giID10 0.9740337 2.296254 0.35388878 8.081078 3.6760741 0.9602948
## giID100 0.5168534 0.308683 0.09848487 11.528448 2.6178709 0.3257963
## GRanges object with 3 ranges and 5 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## giID1 chr21 34696445-34697205 + | <NA> 0
## giID10 chr21 36207782-36209962 + | <NA> 0
## giID100 chr7 150734773-150737527 + | <NA> 0
## sgGroup giID names
## <factor> <factor> <factor>
## giID1 K27ac_top10_HUES64.bed giID1 <NA>
## giID10 K27ac_top10_HUES64.bed giID10 <NA>
## giID100 K27ac_top10_HUES64.bed giID100 <NA>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
If an integer is entered for either ‘kmeans_k’ or ‘cutree_rows’, then a profileplyr object will be returned with a cluster assigned to each range, and a column is added to the range metadata with the cluster for each range.
set.seed(0)
proplyr_object_subset <- clusterRanges(proplyr_object_subset,
fun = rowMeans,
kmeans_k = 3)
## K means clustering used.
## A column has been added to the range metadata with the column name 'cluster', and the 'rowGroupsInUse' has been set to this column.
## GRanges object with 3 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## giID1 chr21 34696445-34697205 + | <NA> 0
## giID10 chr21 36207782-36209962 + | <NA> 0
## giID100 chr7 150734773-150737527 + | <NA> 0
## sgGroup giID names cluster
## <factor> <factor> <factor> <ordered>
## giID1 K27ac_top10_HUES64.bed giID1 <NA> 1
## giID10 K27ac_top10_HUES64.bed giID10 <NA> 3
## giID100 K27ac_top10_HUES64.bed giID100 <NA> 3
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
To visually inspect the clusters, the heatmap can also be printed (though it will not be returned) if the ‘silent’ argument is set to FALSE. A profileplyr object will still be returned even if silent is set to FALSE.
Kmeans clustering:
set.seed(0)
proplyr_object_subset <- clusterRanges(proplyr_object_subset,
fun = rowMeans,
kmeans_k = 3,
silent = FALSE)
## K means clustering used.
## A column has been added to the range metadata with the column name 'cluster', and the 'rowGroupsInUse' has been set to this column.
We can generate a range heatmap with the ‘generateEnrichedHeatmap’ function introduced above. Here we also set ‘include_group_annotation’ to TRUE because the cluster function changes the grouping to the clusters, so the grouping is now interesting to us
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = TRUE,
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6)
)
Note that here we are manually setting the max of the y axis and the max scale for the colors to match the same schema we used for the heatmap above
This helps becuase the signal for each type of ChIP is so different (i.e. H3K4me3 signal is much higher in general than H3K4me1), but importantly we keep the scales the same within each type of ChIP
export_deepToolsMat(proplyr_object_subset,
con = "clustered_deeptools_export",
overwrite = TRUE)
#system("plotHeatmap -m clustered_deeptools_export.gz -o clustered_deeptools_export.pdf --yMax 50 10 150 50 10 150 --zMax 100 20 180 100 20 180")
proplyr_object_subset_summ <- profileplyr::summarize(proplyr_object_subset,
fun = rowMeans,
output = "long")
levels(proplyr_object_subset_summ$cluster) <- c("Cluster 3", "Cluster 1", "Cluster 2")
head(proplyr_object_subset_summ)
## cluster combined_ranges Sample Signal
## 1 Cluster 3 chr21_34696445_34697205 K27ac_esc 4.0326237
## 2 Cluster 1 chr21_36207782_36209962 K27ac_esc 0.9740337
## 3 Cluster 1 chr7_150734773_150737527 K27ac_esc 0.5168534
## 4 Cluster 3 chrY_21906023_21907384 K27ac_esc 4.0641114
## 5 Cluster 1 chr7_106415706_106416874 K27ac_esc 0.5345198
## 6 Cluster 2 chr7_22706847_22707593 K27ac_esc 12.8651591
ggplot(proplyr_object_subset_summ, aes(x = Sample, y = log(Signal))) +
geom_boxplot() +
facet_grid(~cluster)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The annotateRanges() function passes the ranges of a profileplyr object to the annotatePeak() function of ChIPseeker, and then integrates the information from this output into the profileplyr object. The annotation of the ranges, including the annotation type (promoter, exon, etc) and closest gene, are then compiled in the range metadata. In addition to the profileplyr object, a TxDb object must be specified with the ‘TxDb’ argument. The TxDb argument can be one of three things:
Note that the default action of this function will not change the ‘rowGroupsInUse’ element of the params slot. However, if changeGroupToAnnotation = TRUE, a newly generated metadata column will be added that combines the inherited grouping of the ranges and the genomic annotation that was determined by ChIPseeker.
This function adds a few columns to the rowRanges metadata with the annotation from ChIPseeker, as we can see below.
## >> preparing features information... 2019-12-13 17:40:03
## >> identifying nearest features... 2019-12-13 17:40:03
## >> calculating distance from peak to TSS... 2019-12-13 17:40:04
## >> assigning genomic annotation... 2019-12-13 17:40:04
## >> adding gene annotation... 2019-12-13 17:40:19
## >> assigning chromosome lengths 2019-12-13 17:40:19
## >> done... 2019-12-13 17:40:19
## GRanges object with 3 ranges and 19 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## giID1 chr21 34696445-34697205 + | <NA> 0
## giID10 chr21 36207782-36209962 + | <NA> 0
## giID100 chr7 150734773-150737527 + | <NA> 0
## sgGroup giID names cluster
## <factor> <factor> <factor> <ordered>
## giID1 K27ac_top10_HUES64.bed giID1 <NA> 1
## giID10 K27ac_top10_HUES64.bed giID10 <NA> 3
## giID100 K27ac_top10_HUES64.bed giID100 <NA> 3
## annotation geneChr geneStart geneEnd
## <character> <integer> <integer> <integer>
## giID1 Promoter (<=1kb) 21 34696734 34732128
## giID10 Promoter (<=1kb) 21 36169710 36206898
## giID100 Exon (uc003wik.4/11194, exon 10 of 16) 7 150745379 150749843
## geneLength geneStrand geneId transcriptId distanceToTSS
## <integer> <integer> <character> <character> <numeric>
## giID1 35395 1 3454 uc011adv.2 0
## giID10 37189 2 861 uc002yul.1 -884
## giID100 4465 1 9311 uc003win.3 -7852
## ENSEMBL SYMBOL
## <character> <character>
## giID1 ENSG00000142166 IFNAR1
## giID10 ENSG00000159216 RUNX1
## giID100 ENSG00000213199 ASIC3
## GENENAME annotation_short
## <character> <ordered>
## giID1 interferon alpha and beta receptor subunit 1 Promoter
## giID10 RUNX family transcription factor 1 Promoter
## giID100 acid sensing ion channel subunit 3 Exon
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## $perSampleDPParams
## [1] "upstream" "downstream" "body" "bin.size"
## [5] "ref.point" "unscaled.5.prime" "unscaled.3.prime" "sample_labels"
##
## $rowGroupsInUse
## [1] "cluster"
generateEnrichedHeatmap(proplyr_object_subset,
extra_annotation_columns = c("annotation_short"),
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6)
)
The GREAT algorithm for annotating genomic ranges with genes in the same regulatory region is also commonly used. profileplyr has a function, annotateRanges_great(), which will add a column to the range metadata showing whether that range is in the regulatory region of a particular gene. Note this is different from ChIPseeker, which will annotate with the closest gene. annotateRagnes_great() may also result in duplication of ranges within the profileplyr object if a range is in the regulatory region of two or more genes.
# remake another subsetted object to demonstrate this function (the ChIPseeker annotated object will be used for chunks other than this one)
proplyr_object_subset_great <- proplyr_object[, , c(-1, -3, -6, -8)]
proplyr_object_subset_great <- annotateRanges_great(proplyr_object_subset_great,
species = "hg19")
## GRanges object with 3 ranges and 7 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## giID808 chr1 1250419-1251035 + | <NA> 0
## giID808 chr1 1250419-1251035 + | <NA> 0
## giID785 chr1 2209362-2211662 + | <NA> 0
## sgGroup giID names SYMBOL distanceToTSS
## <factor> <factor> <factor> <character> <numeric>
## giID808 K27ac_top10_HUES64.bed giID808 <NA> PUSL1 6780
## giID808 K27ac_top10_HUES64.bed giID808 <NA> CPSF3L 9262
## giID785 K27ac_top10_HUES64.bed giID785 <NA> RER1 -112760
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
mcols(proplyr_object_subset)$newColumn <- rnorm(n = nrow(proplyr_object_subset))
params(proplyr_object_subset)$rowGroupsInUse
## [1] "cluster"
switchGroup <- groupBy(proplyr_object_subset,
group = "newColumn")
params(switchGroup)$rowGroupsInUse
## [1] "newColumn"
GATA4_1 <- import.bed("GSM1505644_Gata4_022113_meso.bed.peak.txt.gz")
GATA4_2 <- import.bed("GSM1505645_Gata4_041914_meso.bed.peak.txt.gz")
GATA4_HC <- GATA4_1[GATA4_1 %over% GATA4_2]
seqlevelsStyle(GATA4_HC) <- "UCSC"
When a profileplyr object is grouped by GRanges a column is added to the range metadata titled “GR_overlap_names” that indicates which GRanges that range overlaps with (or an indication that there was no overlap). The ‘rowGroupsInUse’ slot of the params is automatically changed to this column, and this is reflected in the heatmap on the next slide.
proplyr_object_subset <- groupBy(proplyr_object_subset,
group = GATA4_HC,
GRanges_names = "GATA4_peaks",
include_nonoverlapping = TRUE)
table(mcols(proplyr_object_subset)$GR_overlap_names)
##
## GATA4_peaks no_overlap
## 124 876
We then can change the ‘rowGroupsInUse’ slot of the params back to the ‘cluster’ column, and add the ‘GR_overlap_names’ column to the ‘extra_annotation_columns’ argument of generateEnrichedHeatmap() to see how the overlap changes based on our clusters.
proplyr_object_subset <- groupBy(proplyr_object_subset,
group = "cluster")
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = TRUE,
extra_annotation_columns = c("annotation_short",
"GR_overlap_names"),
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6)
)
A profileplyr object can also be subset or annotated based on overlap of its gene annotations (from annotateRanges() or annotateRanges_great()) with a list of genes from the user. This argument must be a list(), and the elements of list can either be a character vector of gene symbols or a list of data frames with the gene symbols as the rownames. Additional columns in these data frames will be inculded as columns in the range metadata and can also be used for additional range annotation.
First we will read in two genes lists, one for genes going up in the mesoderm compared to the ESCs, and one for the genes that go down (or up in ESC). This analysis was done with DESeq2 using the following counts tables downloaded from ENCODE: * HUES64 ESC: Replicate 1 and Replicate 2 * Mesoderm: Replicate 1 and Replicate 2
library(dplyr)
UP_in_meso_genes <- read.table("UP_in_meso_genes.txt",
header = FALSE,
stringsAsFactors = FALSE) %>%
.[,1]
head(UP_in_meso_genes)
## [1] "HAPLN1" "COL3A1" "ANXA1" "ITGA1" "LOX" "COL1A1"
DOWN_in_meso_genes <- read.table("DOWN_in_meso_genes.txt",
header = FALSE,
stringsAsFactors = FALSE) %>%
.[,1]
head(UP_in_meso_genes)
## [1] "HAPLN1" "COL3A1" "ANXA1" "ITGA1" "LOX" "COL1A1"
We combine these gene sets into a single list, and again use the groupBy() function to group the ranges. This will create a column in the range metadata shoing these overlaps, and the’rowgroupsInUse’ will by default be switched to this column. Remember that you can always switch this grouping to any column you’d like with the same groupBy() function, as shown above.
# IMPORTANT: The profileplyr object must have been annotated with either annotateRanges() or annotateRanges_great() before subsetting by a gene list
# since we have already performed annotateRanges() on this object, we don't need to do it here
geneList_for_grouping <- list(meso_UP= UP_in_meso_genes,
meso_DOWN = DOWN_in_meso_genes)
proplyr_object_subset <- groupBy(proplyr_object_subset,
group = geneList_for_grouping)
## A column has been added to the range metadata with the column name 'GL_overlap_names' that specifies the GRanges each range overlaps with, but the inherited groups are not included.
## DataFrame with 499 rows and 25 columns
## name score sgGroup giID names
## <character> <numeric> <factor> <factor> <factor>
## giID100 NA 0 K27ac_top10_HUES64.bed giID100 NA
## giID113 NA 0 K27ac_top10_HUES64.bed giID113 NA
## giID131 NA 0 K27ac_top10_HUES64.bed giID131 NA
## giID137 NA 0 K27ac_top10_HUES64.bed giID137 NA
## giID154 NA 0 K27ac_top10_HUES64.bed giID154 NA
## ... ... ... ... ... ...
## giID989 NA 0 K27ac_top10_HUES64.bed giID989 NA
## giID992 NA 0 K27ac_top10_HUES64.bed giID992 NA
## giID994 NA 0 K27ac_top10_HUES64.bed giID994 NA
## giID997 NA 0 K27ac_top10_HUES64.bed giID997 NA
## giID999 NA 0 K27ac_top10_HUES64.bed giID999 NA
## cluster annotation geneChr
## <ordered> <character> <integer>
## giID100 3 Exon (uc003wik.4/11194, exon 10 of 16) 7
## giID113 3 Distal Intergenic 7
## giID131 3 Intron (uc003thn.2/79783, intron 12 of 13) 7
## giID137 1 Promoter (2-3kb) 7
## giID154 3 Intron (uc002ntd.3/22847, intron 4 of 5) 19
## ... ... ... ...
## giID989 1 Promoter (<=1kb) 23
## giID992 2 Distal Intergenic 23
## giID994 1 Promoter (<=1kb) 23
## giID997 1 Promoter (<=1kb) 23
## giID999 2 Distal Intergenic 24
## geneStart geneEnd geneLength geneStrand geneId transcriptId
## <integer> <integer> <integer> <integer> <character> <character>
## giID100 150745379 150749843 4465 1 9311 uc003win.3
## giID113 152456834 152552464 95631 1 57180 uc011kvp.2
## giID131 40833056 40900366 67311 1 79783 uc003thp.2
## giID137 5346423 5463177 116755 2 84629 uc003soi.4
## giID154 32836514 32878573 42060 1 22847 uc002nte.3
## ... ... ... ... ... ... ...
## giID989 100805514 100809675 4162 1 51309 uc004ehv.3
## giID992 112018105 112084043 65939 2 154796 uc004eps.3
## giID994 54556644 54593720 37077 1 54552 uc022bxi.1
## giID997 84498997 84528368 29372 1 7552 uc004eeo.3
## giID999 15815447 15817902 2456 1 9087 uc004ftb.3
## distanceToTSS ENSEMBL SYMBOL
## <numeric> <character> <character>
## giID100 -7852 ENSG00000213199 ASIC3
## giID113 228976 ENSG00000133627 ACTR3B
## giID131 -58381 ENSG00000175600 SUGCT
## giID137 2180 ENSG00000182095 TNRC18
## giID154 26757 ENSG00000168813 ZNF507
## ... ... ... ...
## giID989 0 ENSG00000126947 ARMCX1
## giID992 -558621 ENSG00000126016 AMOT
## giID994 43 ENSG00000130119 GNL3L
## giID997 0 ENSG00000147180 ZNF711
## giID999 97458 ENSG00000154620 TMSB4Y
## GENENAME annotation_short
## <character> <factor>
## giID100 acid sensing ion channel subunit 3 Exon
## giID113 actin related protein 3B Distal Intergenic
## giID131 succinyl-CoA:glutarate-CoA transferase Intron
## giID137 trinucleotide repeat containing 18 Promoter
## giID154 zinc finger protein 507 Intron
## ... ... ...
## giID989 armadillo repeat containing X-linked 1 Promoter
## giID992 angiomotin Distal Intergenic
## giID994 G protein nucleolar 3 like Promoter
## giID997 zinc finger protein 711 Promoter
## giID999 thymosin beta 4 Y-linked Distal Intergenic
## newColumn overlap_matrix GR_overlap_names name.1
## <numeric> <matrix> <factor> <character>
## giID100 0.414641434456408 1:0 GATA4_peaks 26846
## giID113 -1.23753842192996 0:1 GATA4_peaks 26884
## giID131 -0.279346281854269 0:1 GATA4_peaks 25916
## giID137 -1.0655905803883 0:1 GATA4_peaks 25547
## giID154 -0.814968708869917 0:1 GATA4_peaks 14085
## ... ... ... ... ...
## giID989 0.95706365807995 0:1 no_overlap NA
## giID992 0.0863349346465467 1:0 no_overlap NA
## giID994 -0.969918022860184 0:1 no_overlap NA
## giID997 -0.703252294210806 0:1 no_overlap NA
## giID999 1.84110689326317 0:1 no_overlap NA
## score.1 GL_overlap_names
## <numeric> <ordered>
## giID100 94.6 meso_UP
## giID113 143.25 meso_DOWN
## giID131 319.84 meso_DOWN
## giID137 255.95 meso_DOWN
## giID154 50.31 meso_DOWN
## ... ... ...
## giID989 NA meso_DOWN
## giID992 NA meso_UP
## giID994 NA meso_DOWN
## giID997 NA meso_DOWN
## giID999 NA meso_DOWN
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = TRUE,
extra_annotation_columns = c("annotation_short"),
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6)
)
The profileplyr object can always be exported to deepTools, however, remember that while the grouping will be preserved in deepTools, the extra annotation columns will not.
## [1] "groupBy_geneList_deeptools"
The following table is simply a DESeq2 results table subset for gene symbol and the stat column indicating direction and extent of gene expression changes. By using a data frame with this column, we can then use this information as an extra annotation column in our range heatmap.
## meso_vs_esc_stat
## HAPLN1 44.41412
## COL3A1 43.80409
## ANXA1 40.33583
## ITGA1 35.59385
## LOX 35.55025
## COL1A1 35.05454
After this grouping, a column will be added to the range metadata called “GL_overlap_names”, in addition to the other columns in the user-supplied data frame.
note that in addition to the ‘GL_overlap_names’ column showing overlap (which isn’t important since we used all genes here), there is a column showing the expression changes for all these genes in meso vs esc
proplyr_object_subset <- groupBy(proplyr_object_subset,
group = list(meso_vs_esc_stat), # note that this must be a list, and you could have more than one data frame here
include_nonoverlapping = FALSE)
#since group will be changed automatically to the overlap with the gene lists, we again can chang back to clusters
proplyr_object_subset <- groupBy(proplyr_object_subset,
group = "cluster")
tail(colnames(mcols(proplyr_object_subset)))
## [1] "GR_overlap_names" "name.1" "score.1"
## [4] "GL_overlap_names" "GL_overlap_names.1" "meso_vs_esc_stat"
Here we use the gene expression information to generate a heatmap with the expression of genes in mesoderm vs HUES64 cells for each cluster
You’ll notice in the previous figure that the color scale for the expression data is unbalanced so that 0 is not right in the middle, resulting in most ranges aving a red color. And we want a different color for genes that go up and another color for genes that go down, with genes that don’t change being white. The colors of the extra annotation columns can be changed with the ‘extra_anno_color’ argument, which takes a list containing elements taht correspond to each extra annotation heatmap on the right side of the figure.
The list must be the same length as the number of extra columns, and each element can either be NULL (keep default coloring), a character or numeric vector with two or three colors depending on whether you want a two or three color progression, or a function that calls colorRamp(). Below we will generate a function to make a three color scheme, with 0 in the being white. We also make the max and min values the 98th and 2nd percentiles to avoid the extreme values driving the colors.
library(circlize)
meso_vs_esc_statonly <- mcols(proplyr_object_subset)$meso_vs_esc_stat
q_98 <- quantile(meso_vs_esc_statonly, 0.98)
q_01 <- quantile(meso_vs_esc_statonly, 0.02)
meso_vs_esc_stat_color <- (colorRamp2(c(q_01, 0, q_98),
c("blue", "white", "red")))
This function will be used for the gene expression column, and we will change the colors of the GATA4 overlap column as well so that no overlap is white.
generateEnrichedHeatmap(proplyr_object_subset,
include_group_annotation = TRUE,
extra_annotation_columns = c("annotation_short",
"GR_overlap_names",
"meso_vs_esc_stat"),
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6),
extra_anno_color = list(NULL, c("red", "white"), meso_vs_esc_stat_color)
)
# make and subset profileplyr object from bam files
proplyr_object_pipe <- readRDS("all_bigwigs_HUES64_meso_esc_only_top10perc_1Ksample.RData") %>%
.[, , c(-1, -3, -6, -8)] %>%
annotateRanges(TxDb = "hg19") %>%
clusterRanges(fun = rowMeans,
kmeans_k = 3,
silent = TRUE) %>%
groupBy(group = GATA4_HC,
GRanges_names = "GATA4_peaks",
include_nonoverlapping = TRUE) %>%
groupBy(group = list(meso_vs_esc_stat),
include_nonoverlapping = FALSE) %>%
groupBy(group = "cluster")
## >> preparing features information... 2019-12-13 17:42:15
## >> identifying nearest features... 2019-12-13 17:42:15
## >> calculating distance from peak to TSS... 2019-12-13 17:42:15
## >> assigning genomic annotation... 2019-12-13 17:42:15
## >> adding gene annotation... 2019-12-13 17:42:17
## >> assigning chromosome lengths 2019-12-13 17:42:17
## >> done... 2019-12-13 17:42:17
# make the modifications to the object not amenable to pipe
row.names(sampleData(proplyr_object_pipe)) <- c("K27ac_esc", "K4me1_esc", "K4me3_esc",
"K27ac_meso", "K4me1_meso", "K4me3_meso")
sampleData(proplyr_object_pipe)$tissue <- c(rep("esc", 3), rep("meso", 3))
sampleData(proplyr_object_pipe)$chip <- c(rep(c("K27ac", "K4me1", "K4me3"), 2))
library(dplyr)
generateEnrichedHeatmap(proplyr_object_pipe,
include_group_annotation = TRUE,
extra_annotation_columns = c("annotation_short",
"GR_overlap_names",
"meso_vs_esc_stat"),
color_by_sample_group = "chip",
ylim = "chip",
show_heatmap_legend = rep(FALSE, 6),
extra_anno_color = list(NULL, c("red", "white"), meso_vs_esc_stat_color)
)