library(ggplot2)
library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, aperm,
## append, as.data.frame, basename, cbind, colnames, dirname,
## do.call, duplicated, eval, evalq, get, grep, grepl, intersect,
## is.unsorted, lapply, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, rank, rbind, rownames, sapply,
## saveRDS, setdiff, table, tapply, union, unique, unsplit,
## which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## I, expand.grid, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(InteractionSet)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs,
## rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(HiCExperiment)
## Consider using the `HiContacts` package to perform advanced genomic operations
## on `HiCExperiment` objects.
##
## Read "Orchestrating Hi-C analysis with Bioconductor" online book to learn more:
## https://js2264.github.io/OHCA/
##
## Attaching package: 'HiCExperiment'
## The following object is masked from 'package:SummarizedExperiment':
##
## metadata<-
## The following object is masked from 'package:S4Vectors':
##
## metadata<-
## The following object is masked from 'package:ggplot2':
##
## resolution
library(HiContactsData)
## Loading required package: ExperimentHub
## Loading required package: AnnotationHub
## Loading required package: BiocFileCache
## Loading required package: dbplyr
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
##
## cache
3 Manipulating Hi-C data in R
This chapter focuses on:
- Modifying information associated with an existing
HiCExperiment
object - Subsetting a
HiCExperiment
object - Coercing a
HiCExperiment
object in a base data structure
- An
HiCExperiment
object allows random access parsing of a disk-stored contact matrix. - An
HiCExperiment
object operates by wrapping together (1) aContactFile
(i.e. a connection to a disk-stored data file) and (2) aGInteractions
generated by parsing the data file.
HiCExperiment
objects π
- Creating a connection to a disk-stored contact matrix:
# coolf <- "<path-to-disk-stored-contact-matrix.cool>"
coolf <- HiContactsData('yeast_wt', 'mcool')
## see ?HiContactsData and browseVignettes('HiContactsData') for documentation
## loading from cache
cf <- CoolFile(coolf)
availableResolutions(cf)
## resolutions(5): 1000 2000 4000 8000 16000
##
availableChromosomes(cf)
## Seqinfo object with 16 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## I 230218 <NA> <NA>
## II 813184 <NA> <NA>
## III 316620 <NA> <NA>
## IV 1531933 <NA> <NA>
## V 576874 <NA> <NA>
## ... ... ... ...
## XII 1078177 <NA> <NA>
## XIII 924431 <NA> <NA>
## XIV 784333 <NA> <NA>
## XV 1091291 <NA> <NA>
## XVI 948066 <NA> <NA>
- Importing a contact matrix over a specific genomic location, at a given resolution:
hic <- import(cf, focus = 'II:10000-50000', resolution = 4000)
hic
## `HiCExperiment` object with 10,801 contacts over 11 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:10,000-50,000"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 4000
## interactions: 45
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Recovering genomic interactions stored in a
HiCExperiment
:
interactions(hic)
## GInteractions object with 45 interactions and 4 metadata columns:
## seqnames1 ranges1 seqnames2 ranges2 | bin_id1 bin_id2
## <Rle> <IRanges> <Rle> <IRanges> | <numeric> <numeric>
## [1] II 12001-16000 --- II 12001-16000 | 61 61
## [2] II 12001-16000 --- II 16001-20000 | 61 62
## [3] II 12001-16000 --- II 20001-24000 | 61 63
## [4] II 12001-16000 --- II 24001-28000 | 61 64
## [5] II 12001-16000 --- II 28001-32000 | 61 65
## ... ... ... ... ... ... . ... ...
## [41] II 36001-40000 --- II 40001-44000 | 67 68
## [42] II 36001-40000 --- II 44001-48000 | 67 69
## [43] II 40001-44000 --- II 40001-44000 | 68 68
## [44] II 40001-44000 --- II 44001-48000 | 68 69
## [45] II 44001-48000 --- II 44001-48000 | 69 69
## count balanced
## <numeric> <numeric>
## [1] 213 0.249303
## [2] 673 0.449271
## [3] 325 0.210001
## [4] 137 0.125732
## [5] 77 0.106917
## ... ... ...
## [41] 941 0.358860
## [42] 275 0.114972
## [43] 675 0.253868
## [44] 497 0.204920
## [45] 295 0.133344
## -------
## regions: 11 ranges and 4 metadata columns
## seqinfo: 16 sequences from an unspecified genome
hic
object π
To demonstrate how to manipulate a HiCExperiment
object, we will create an HiCExperiment
object from an example .cool
file provided in the HiContactsData
package.
library(HiCExperiment)
library(HiContactsData)
# ---- This downloads an example `.mcool` file and caches it locally
coolf <- HiContactsData('yeast_wt', 'mcool')
## see ?HiContactsData and browseVignettes('HiContactsData') for documentation
## loading from cache
# ---- This creates a connection to the disk-stored `.mcool` file
cf <- CoolFile(coolf)
cf
## CoolFile object
## .mcool file: /home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752
## resolution: 1000
## pairs file:
## metadata(0):
# ---- This imports contacts from the long arm of chromosome `II`, at resolution `2000`
hic <- import(cf, focus = 'II:300001-813184', resolution = 2000)
hic
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
3.1 Subsetting a contact matrix
Two entirely different approaches are possible to subset of a Hi-C contact matrix:
Subsetting before importing: leveraging random access to a disk-stored contact matrix to only import interactions overlapping with a genomic locus of interest.
Subsetting after importing: parsing the entire contact matrix in memory, and subsequently subset interactions overlapping with a genomic locus of interest.
3.1.1 Subsetting before import: with focus
Specifying a focus
when importing a dataset in R (i.e. "Subset first, then parse"
) is generally the recommended approach to import Hi-C data in R.
The focus
argument can be set when import
ing a ContactFile
in R, as follows:
## This code is not evaluated
## Change `focus = "..."` accordingly (see below)
import(cf, focus = "...")
This ensures that only the needed data is parsed in R, reducing memory load and accelerating the import. Thus, this should be the preferred way of parsing HiCExperiment
data, as disk-stored contact matrices allow efficient random access to indexed data.
focus
can be any of the following string types:
# "II" --> import contacts over an entire chromosome
# "II:300001-800000" --> import on-diagonal contacts within a chromosome
# "II:300001-400000|II:600001-700000" --> import off-diagonal contacts within a chromosome
# "II|III" --> import contacts between two chromosomes
# "II:300001-800000|V:1-500000" --> import contacts between segments of two chromosomes
focus
argument π
- Subsetting to a specific on-diagonal genomic location using standard UCSC coordinates query:
import(cf, focus = 'II:300001-800000', resolution = 2000)
## `HiCExperiment` object with 301,018 contacts over 250 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-800,000"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 17974
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting to a specific off-diagonal genomic location using pairs of coordinates query:
import(cf, focus = 'II:300001-400000|II:600001-700000', resolution = 2000)
## `HiCExperiment` object with 402 contacts over 100 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300001-400000|II:600001-700000"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 357
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those constrained within a single chromosome:
import(cf, focus = 'II', resolution = 2000)
## `HiCExperiment` object with 471,364 contacts over 407 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 34063
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those between two chromosomes:
import(cf, focus = 'II|III', resolution = 2000)
## `HiCExperiment` object with 9,092 contacts over 566 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II|III"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 7438
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those between parts of two chromosomes:
import(cf, focus = 'II:300001-800000|V:1-500000', resolution = 2000)
## `HiCExperiment` object with 7,147 contacts over 500 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300001-800000|V:1-500000"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 6523
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
3.1.2 Subsetting after import
It may sometimes be desirable to import a full dataset from disk first, and only then perform in-memory subsetting of the HiCExperiment
object (i.e. "Parse first, then subset"
). This is for example necessary when the end user aims to investigate subsets of interactions across a large number of different areas of a contact matrix.
Several strategies are possible to allow subsetting of imported data, either with subsetByOverlaps
or [
.
3.1.2.1 subsetByOverlaps(<HiCExperiment>, <GRanges>)
subsetByOverlaps
can take a HiCExperiment
as a query
and a GRanges
as a query. In this case, the GRanges
is used to extract a subset of a HiCExperiment
constrained within a specific genomic location.
telomere <- GRanges("II:700001-813184")
subsetByOverlaps(hic, telomere) |> interactions()
## GInteractions object with 1540 interactions and 4 metadata columns:
## seqnames1 ranges1 seqnames2 ranges2 | bin_id1
## <Rle> <IRanges> <Rle> <IRanges> | <numeric>
## [1] II 700001-702000 --- II 700001-702000 | 466
## [2] II 700001-702000 --- II 702001-704000 | 466
## [3] II 700001-702000 --- II 704001-706000 | 466
## [4] II 700001-702000 --- II 706001-708000 | 466
## [5] II 700001-702000 --- II 708001-710000 | 466
## ... ... ... ... ... ... . ...
## [1536] II 804001-806000 --- II 810001-812000 | 518
## [1537] II 806001-808000 --- II 806001-808000 | 519
## [1538] II 806001-808000 --- II 808001-810000 | 519
## [1539] II 806001-808000 --- II 810001-812000 | 519
## [1540] II 808001-810000 --- II 808001-810000 | 520
## bin_id2 count balanced
## <numeric> <numeric> <numeric>
## [1] 466 30 0.0283618
## [2] 467 145 0.0709380
## [3] 468 124 0.0704979
## [4] 469 59 0.0510221
## [5] 470 59 0.0384004
## ... ... ... ...
## [1536] 521 1 NaN
## [1537] 519 15 0.0560633
## [1538] 520 25 NaN
## [1539] 521 1 NaN
## [1540] 520 10 NaN
## -------
## regions: 57 ranges and 4 metadata columns
## seqinfo: 16 sequences from an unspecified genome
By default, subsetByOverlaps(hic, telomere)
will only recover interactions constrained within telomere
, i.e. interactions for which both ends are in telomere
.
Alternatively, type = "any"
can be specified to get all interactions with at least one of their anchors within telomere
.
subsetByOverlaps(hic, telomere, type = "any") |> interactions()
## GInteractions object with 6041 interactions and 4 metadata columns:
## seqnames1 ranges1 seqnames2 ranges2 | bin_id1
## <Rle> <IRanges> <Rle> <IRanges> | <numeric>
## [1] II 300001-302000 --- II 702001-704000 | 266
## [2] II 300001-302000 --- II 704001-706000 | 266
## [3] II 300001-302000 --- II 768001-770000 | 266
## [4] II 300001-302000 --- II 784001-786000 | 266
## [5] II 302001-304000 --- II 740001-742000 | 267
## ... ... ... ... ... ... . ...
## [6037] II 804001-806000 --- II 810001-812000 | 518
## [6038] II 806001-808000 --- II 806001-808000 | 519
## [6039] II 806001-808000 --- II 808001-810000 | 519
## [6040] II 806001-808000 --- II 810001-812000 | 519
## [6041] II 808001-810000 --- II 808001-810000 | 520
## bin_id2 count balanced
## <numeric> <numeric> <numeric>
## [1] 467 1 0.000590999
## [2] 468 1 0.000686799
## [3] 500 1 0.000728215
## [4] 508 1 0.000923092
## [5] 486 1 0.000382222
## ... ... ... ...
## [6037] 521 1 NaN
## [6038] 519 15 0.0560633
## [6039] 520 25 NaN
## [6040] 521 1 NaN
## [6041] 520 10 NaN
## -------
## regions: 257 ranges and 4 metadata columns
## seqinfo: 16 sequences from an unspecified genome
3.1.2.2 <HiCExperiment>["..."]
The square bracket operator [
allows for more advanced textual queries, similarly to focus
arguments that can be used when importing contact matrices in memory.
This ensures that only the needed data is parsed in R, reducing memory load and accelerating the import. Thus, this should be the preferred way of parsing HiCExperiment
data, as disk-stored contact matrices allow efficient random access to indexed data.
The following string types can be used to subset a HiCExperiment
object with the [
notation:
# "II" --> import contacts over an entire chromosome
# "II:300001-800000" --> import on-diagonal contacts within a chromosome
# "II:300001-400000|II:600001-700000" --> import off-diagonal contacts within a chromosome
# "II|III" --> import contacts between two chromosomes
# "II:300001-800000|V:1-500000" --> import contacts between segments of two chromosomes
# c("II", "III", "IV") --> import contacts within and between several chromosomes
[
π
- Subsetting to a specific on-diagonal genomic location using standard UCSC coordinates query:
hic["II:800001-813184"]
## `HiCExperiment` object with 1,040 contacts over 6 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:800,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 19
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting to a specific off-diagonal genomic location using pairs of coordinates query:
hic["II:300001-320000|II:800001-813184"]
## `HiCExperiment` object with 3 contacts over 6 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300001-320000|II:800001-813184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 3
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those constrained within a single chromosome:
hic["II"]
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those between two chromosomes:
hic["II|IV"]
## `HiCExperiment` object with 0 contacts over 0 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:1-813184|IV:1-1531933"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 0
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those between segments of two chromosomes:
hic["II:300001-320000|IV:1-100000"]
## `HiCExperiment` object with 0 contacts over 0 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300001-320000|IV:1-100000"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 0
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
- Subsetting interactions to retain those constrained within several chromosomes:
hic[c('II', 'III', 'IV')]
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II, III, IV"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
Some notes:
- This last example (subsetting for a vector of several chromosomes) is the only scenario for which
[
-based in-memory subsetting of pre-imported data is the only way to go, as such subsetting is not possible withfocus
from disk-stored data. - All the other
[
subsetting scenarii illustrated above can be achieved more efficiently using thefocus
argument whenimport
ing data into aHiCExperiment
object. - However, keep in mind that subsetting preserves extra data, e.g. added
scores
,topologicalFeatures
,metadata
orpairsFile
, whereas this information is lost usingfocus
withimport
.
3.1.3 Zooming on a HiCExperiment
βZoomingβ refers to dynamically changing the resolution of a HiCExperiment
. By zoom
ing a HiCExperiment
, one can refine or coarsen the contact matrix. This operation takes aContactFile
and focus
from an existing HiCExperiment
input and re-generates a new HiCExperiment
with updated resolution
, interactions
and scores
. Note that zoom
will preserve existing metadata
, topologicalFeatures
and pairsFile
information.
hic
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
zoom(hic, 4000)
## `HiCExperiment` object with 306,212 contacts over 129 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 4000
## interactions: 6800
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
zoom(hic, 1000)
## `HiCExperiment` object with 306,212 contacts over 514 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 1000
## interactions: 44363
## scores(2): count balanced
## topologicalFeatures: compartments(0) borders(0) loops(0) viewpoints(0)
## pairsFile: N/A
## metadata(0):
The sum of raw counts do not change after zoom
ing, however the number of individual interactions
and regions
changes.
-
zoom
does not change thefocus
! It only affects theresolution
(and consequently, theinteractions
). -
zoom
will only work for multi-resolution contact matrices, e.g..mcool
or.hic
.
3.2 Updating an HiCExperiment
object
HiCExperiment
slots are mutable (β
) / immutable (βοΈ)?
-
fileName(hic)
: βοΈ (obtained from disk-stored file) -
focus(hic)
: π€ (see subsetting section) -
resolutions(hic)
: βοΈ (obtained from disk-stored file) -
resolution(hic)
: π€ (see zooming section) -
interactions(hic)
: βοΈ (obtained from disk-stored file) -
scores(hic)
: β -
topologicalFeatures(hic)
: β -
pairsFile(hic)
: β -
metadata(hic)
: β
3.2.1 Immutable slots
An HiCExperiment
object acts as an interface exposing disk-stored data. This implies that the fileName
slot itself is immutable (i.e. cannot be changed). This should be obvious, as a HiCExperiment
has to be associated with a disk-stored contact matrix to properly function (except in some advanced cases developed in next chapters).
For this reason, methods to manually modify interactions
and resolutions
slots are also not exposed in the HiCExperiment
package.
A corollary of this is that the associated regions
and anchors
of an HiCExperiment
should not be modified by hand either, since they are directly linked to interactions
.
3.2.2 Mutable slots
That being said, HiCExperiment
objects are flexible and can be partially modified in memory without having to change/overwrite the original, disk-stored contact matrix.
Several slots
can be modified in memory: slots
, topologicalFeatures
, pairsFile
and metadata
.
3.2.2.1 scores
We have seen in the previous chapter that scores are stored in a list
and are available using the scores
function.
Extra scores can be added to this list, e.g. to describe the βexpectedβ interaction frequency for each interaction stored in the HiCExperiment
object). This can be achieved using the scores()<-
function.
3.2.2.2 topologicalFeatures
The end-user can create additional topologicalFeatures
or modify the existing ones using the topologicalFeatures()<-
function.
topologicalFeatures(hic, 'CTCF') <- GRanges(c(
"II:340-352",
"II:3520-3532",
"II:7980-7992",
"II:9240-9252"
))
topologicalFeatures(hic, 'CTCF')
## GRanges object with 4 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] II 340-352 *
## [2] II 3520-3532 *
## [3] II 7980-7992 *
## [4] II 9240-9252 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
topologicalFeatures(hic, 'loops') <- GInteractions(
topologicalFeatures(hic, 'CTCF')[rep(1:3, each = 3)],
topologicalFeatures(hic, 'CTCF')[rep(1:3, 3)]
)
topologicalFeatures(hic, 'loops')
## GInteractions object with 9 interactions and 0 metadata columns:
## seqnames1 ranges1 seqnames2 ranges2
## <Rle> <IRanges> <Rle> <IRanges>
## [1] II 340-352 --- II 340-352
## [2] II 340-352 --- II 3520-3532
## [3] II 340-352 --- II 7980-7992
## [4] II 3520-3532 --- II 340-352
## [5] II 3520-3532 --- II 3520-3532
## [6] II 3520-3532 --- II 7980-7992
## [7] II 7980-7992 --- II 340-352
## [8] II 7980-7992 --- II 3520-3532
## [9] II 7980-7992 --- II 7980-7992
## -------
## regions: 3 ranges and 0 metadata columns
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
hic
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(3): count balanced random
## topologicalFeatures: compartments(0) borders(0) loops(9) viewpoints(0) CTCF(4)
## pairsFile: N/A
## metadata(0):
All these objects can be used in *Overlap
methods, as they all extend the GRanges
class of objects.
# ---- This counts the number of times `CTCF` anchors are being used in the
# `loops` `GInteractions` object
countOverlaps(
query = topologicalFeatures(hic, 'CTCF'),
subject = topologicalFeatures(hic, 'loops')
)
## [1] 5 5 5 0
3.2.2.3 pairsFile
If pairsFile
is not specified when importing the ContactFile
into a HiCExperiment
object, one can add it later.
pairsf <- HiContactsData('yeast_wt', 'pairs.gz')
## see ?HiContactsData and browseVignettes('HiContactsData') for documentation
## loading from cache
pairsFile(hic) <- pairsf
hic
## `HiCExperiment` object with 306,212 contacts over 257 regions
## -------
## fileName: "/home/biocbuild/.cache/R/ExperimentHub/1895c561583f2e_7752"
## focus: "II:300,001-813,184"
## resolutions(5): 1000 2000 4000 8000 16000
## active resolution: 2000
## interactions: 18513
## scores(3): count balanced random
## topologicalFeatures: compartments(0) borders(0) loops(9) viewpoints(0) CTCF(4)
## pairsFile: /home/biocbuild/.cache/R/ExperimentHub/1895c54cd82178_7753
## metadata(0):
3.2.2.4 metadata
Metadata associated with a HiCExperiment
can be updated at any point.
3.3 Coercing HiCExperiment
objects
Convenient coercing functions exist to transform data stored as a HiCExperiment
into another class.
-
as.matrix()
: allows to coerce theHiCExperiment
into asparse
ordense
matrix (using thesparse
logical argument,TRUE
by default) and choosing specificscores
of interest (using theuse.scores
argument,"balanced"
by default).
# ----- `as.matrix` coerces a `HiCExperiment` into a dense `matrix` by default
as.matrix(hic) |> class()
## [1] "matrix" "array"
as.matrix(hic) |> dim()
## [1] 257 257
# ----- One can specify which scores should be used when coercing into a matrix
as.matrix(hic, use.scores = "balanced")[1:5, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.009657438 0.07662234 0.05410199 0.04294051 0.04090521
## [2,] 0.076622340 0.05128277 0.09841564 0.06926737 0.05263611
## [3,] 0.054101992 0.09841564 0.05657589 0.08723160 0.07316890
## [4,] 0.042940512 0.06926737 0.08723160 0.03699543 0.08403496
## [5,] 0.040905212 0.05263611 0.07316890 0.08403496 0.04787415
as.matrix(hic, use.scores = "count")[1:5, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 7 92 75 61 38
## [2,] 92 102 226 163 81
## [3,] 75 226 150 237 130
## [4,] 61 163 237 103 153
## [5,] 38 81 130 153 57
# ----- If needed, one can coerce a HiCExperiment into a sparse matrix
as.matrix(hic, use.scores = "count", sparse = TRUE)[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgTMatrix"
##
## [1,] 7 92 75 61 38
## [2,] 92 102 226 163 81
## [3,] 75 226 150 237 130
## [4,] 61 163 237 103 153
## [5,] 38 81 130 153 57
-
as.data.frame()
: simply coercinginteractions
into a rectangular data frame
as.data.frame(hic) |> head()
## seqnames1 start1 end1 width1 strand1 bin_id1 weight1 center1
## 1 II 300001 302000 2000 * 266 0.03714342 301000
## 2 II 300001 302000 2000 * 266 0.03714342 301000
## 3 II 300001 302000 2000 * 266 0.03714342 301000
## 4 II 300001 302000 2000 * 266 0.03714342 301000
## 5 II 300001 302000 2000 * 266 0.03714342 301000
## 6 II 300001 302000 2000 * 266 0.03714342 301000
## seqnames2 start2 end2 width2 strand2 bin_id2 weight2 center2 count
## 1 II 300001 302000 2000 * 266 0.03714342 301000 7
## 2 II 302001 304000 2000 * 267 0.02242258 303000 92
## 3 II 304001 306000 2000 * 268 0.01942093 305000 75
## 4 II 306001 308000 2000 * 269 0.01895202 307000 61
## 5 II 308001 310000 2000 * 270 0.02898098 309000 38
## 6 II 310001 312000 2000 * 271 0.01834118 311000 43
## balanced random
## 1 0.009657438 0.5337919
## 2 0.076622340 0.3499953
## 3 0.054101992 0.6272530
## 4 0.042940512 0.8999565
## 5 0.040905212 0.4940897
## 6 0.029293930 0.4214349
These coercing methods only operate on interactions
and scores
, and discard all other information, e.g. regarding genomic regions
, available resolutions
, associated metadata
, pairsFile
or topologicalFeatures
.
Session info
sessioninfo::session_info(include_base = TRUE)
## β Session info ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
## setting value
## version R version 4.4.1 (2024-06-14)
## os Ubuntu 24.04.1 LTS
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate C
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-10-30
## pandoc 2.7.3 @ /usr/bin/ (via rmarkdown)
##
## β Packages ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
## package * version date (UTC) lib source
## abind 1.4-8 2024-09-12 [2] CRAN (R 4.4.1)
## AnnotationDbi 1.68.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## AnnotationHub * 3.14.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## base * 4.4.1 2024-09-25 [3] local
## Biobase * 2.66.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocFileCache * 2.14.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocGenerics * 0.52.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocIO 1.16.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocManager 1.30.25 2024-08-28 [2] CRAN (R 4.4.1)
## BiocParallel 1.40.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## BiocVersion 3.20.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## Biostrings 2.74.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## bit 4.5.0 2024-09-20 [2] CRAN (R 4.4.1)
## bit64 4.5.2 2024-09-22 [2] CRAN (R 4.4.1)
## blob 1.2.4 2023-03-17 [2] CRAN (R 4.4.1)
## cachem 1.1.0 2024-05-16 [2] CRAN (R 4.4.1)
## cli 3.6.3 2024-06-21 [2] CRAN (R 4.4.1)
## codetools 0.2-20 2024-03-31 [3] CRAN (R 4.4.1)
## colorspace 2.1-1 2024-07-26 [2] CRAN (R 4.4.1)
## compiler 4.4.1 2024-09-25 [3] local
## crayon 1.5.3 2024-06-20 [2] CRAN (R 4.4.1)
## curl 5.2.3 2024-09-20 [2] CRAN (R 4.4.1)
## datasets * 4.4.1 2024-09-25 [3] local
## DBI 1.2.3 2024-06-02 [2] CRAN (R 4.4.1)
## dbplyr * 2.5.0 2024-03-19 [2] CRAN (R 4.4.1)
## DelayedArray 0.32.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## digest 0.6.37 2024-08-19 [2] CRAN (R 4.4.1)
## dplyr 1.1.4 2023-11-17 [2] CRAN (R 4.4.1)
## evaluate 1.0.1 2024-10-10 [2] CRAN (R 4.4.1)
## ExperimentHub * 2.14.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## fansi 1.0.6 2023-12-08 [2] CRAN (R 4.4.1)
## fastmap 1.2.0 2024-05-15 [2] CRAN (R 4.4.1)
## filelock 1.0.3 2023-12-11 [2] CRAN (R 4.4.1)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.4.1)
## GenomeInfoDb * 1.42.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## GenomeInfoDbData 1.2.13 2024-10-01 [2] Bioconductor
## GenomicRanges * 1.58.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## ggplot2 * 3.5.1 2024-04-23 [2] CRAN (R 4.4.1)
## glue 1.8.0 2024-09-30 [2] CRAN (R 4.4.1)
## graphics * 4.4.1 2024-09-25 [3] local
## grDevices * 4.4.1 2024-09-25 [3] local
## grid 4.4.1 2024-09-25 [3] local
## gtable 0.3.6 2024-10-25 [2] CRAN (R 4.4.1)
## HiCExperiment * 1.6.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## HiContactsData * 1.7.0 2024-10-26 [2] Bioconductor 3.20 (R 4.4.1)
## htmltools 0.5.8.1 2024-04-04 [2] CRAN (R 4.4.1)
## htmlwidgets 1.6.4 2023-12-06 [2] CRAN (R 4.4.1)
## httr 1.4.7 2023-08-15 [2] CRAN (R 4.4.1)
## InteractionSet * 1.34.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## IRanges * 2.40.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## jsonlite 1.8.9 2024-09-20 [2] CRAN (R 4.4.1)
## KEGGREST 1.46.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## knitr 1.48 2024-07-07 [2] CRAN (R 4.4.1)
## lattice 0.22-6 2024-03-20 [3] CRAN (R 4.4.1)
## lifecycle 1.0.4 2023-11-07 [2] CRAN (R 4.4.1)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.4.1)
## Matrix 1.7-1 2024-10-18 [3] CRAN (R 4.4.1)
## MatrixGenerics * 1.18.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## matrixStats * 1.4.1 2024-09-08 [2] CRAN (R 4.4.1)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.4.1)
## methods * 4.4.1 2024-09-25 [3] local
## mime 0.12 2021-09-28 [2] CRAN (R 4.4.1)
## munsell 0.5.1 2024-04-01 [2] CRAN (R 4.4.1)
## parallel 4.4.1 2024-09-25 [3] local
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.4.1)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.4.1)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.4.1)
## purrr 1.0.2 2023-08-10 [2] CRAN (R 4.4.1)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.4.1)
## rappdirs 0.3.3 2021-01-31 [2] CRAN (R 4.4.1)
## Rcpp 1.0.13 2024-07-17 [2] CRAN (R 4.4.1)
## rhdf5 2.50.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## rhdf5filters 1.18.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## Rhdf5lib 1.28.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## rlang 1.1.4 2024-06-04 [2] CRAN (R 4.4.1)
## rmarkdown 2.28 2024-08-17 [2] CRAN (R 4.4.1)
## RSQLite 2.3.7 2024-05-27 [2] CRAN (R 4.4.1)
## S4Arrays 1.6.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## S4Vectors * 0.44.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.4.1)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.4.1)
## SparseArray 1.6.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## stats * 4.4.1 2024-09-25 [3] local
## stats4 * 4.4.1 2024-09-25 [3] local
## strawr 0.0.92 2024-07-16 [2] CRAN (R 4.4.1)
## SummarizedExperiment * 1.36.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## tibble 3.2.1 2023-03-20 [2] CRAN (R 4.4.1)
## tidyselect 1.2.1 2024-03-11 [2] CRAN (R 4.4.1)
## tools 4.4.1 2024-09-25 [3] local
## tzdb 0.4.0 2023-05-12 [2] CRAN (R 4.4.1)
## UCSC.utils 1.2.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## utf8 1.2.4 2023-10-22 [2] CRAN (R 4.4.1)
## utils * 4.4.1 2024-09-25 [3] local
## vctrs 0.6.5 2023-12-01 [2] CRAN (R 4.4.1)
## vroom 1.6.5 2023-12-05 [2] CRAN (R 4.4.1)
## withr 3.0.2 2024-10-28 [2] CRAN (R 4.4.1)
## xfun 0.48 2024-10-03 [2] CRAN (R 4.4.1)
## XVector 0.46.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
## yaml 2.3.10 2024-07-26 [2] CRAN (R 4.4.1)
## zlibbioc 1.52.0 2024-10-29 [2] Bioconductor 3.20 (R 4.4.1)
##
## [1] /home/biocbuild/bbs-3.20-books/tmpdir/RtmphzOvfD/Rinst2020235c565b68
## [2] /home/biocbuild/bbs-3.20-bioc/R/site-library
## [3] /home/biocbuild/bbs-3.20-bioc/R/library
##
## βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ