##############################################################################
###
### Exercise 1
### ==========

### Before you can load a Bioconductor package, you need to install it with:
###   source("http://bioconductor.org/biocLite.R")
###   biocLite("Biostrings")
### The biocLite() script will take care of installing any other BioC or CRAN
### package that the package you are trying to install depends on.
library(Biostrings)

library(BSgenome)

available.genomes()

library(BSgenome.Dmelanogaster.UCSC.dm3)


##############################################################################
###
### Exercise 2
### ==========

seq <- DNAString("aagtac")

nchar(seq)
alphabetFrequency(seq)

reverseComplement(seq)

subseq(seq, start=2, end=4)
### The same substring can be extracted with
subseq(seq, start=2, width=3)
### or with
subseq(seq, end=4, width=3)
### or with
subseq(seq, end=-3, width=3)
### See ?subseq for more information.

### Note that subseq() doesn't copy the extracted sequence data in memory
### so subseq() is very fast and very efficient even when extracting a
### substring made of millions of nucleotides (e.g. from Human chr1).


##############################################################################
###
### Exercise 3
### ==========

x0 <- DNAStringSet(c("aagtac", "ccc", "gtt"))

width(x0)
length(x0)

x0[-2]

### 2 equivalent ways to invert the order of x0'elements:
###   1. by subsetting:
x0[3:1]
###   2. with rev():
rev(x0)

x0[[1]]

### You could do this:
DNAStringSet(x0, end=width(x0)-2)
### but this is much better:
DNAStringSet(x0, end=-3)

alphabetFrequency(x0)
### Note that you can collapse the matrix returned by alphabetFrequency()
### with:
alphabetFrequency(x0, collapse=TRUE)
### Also you can look at the frequencies of the base letters only:
alphabetFrequency(x0, baseOnly=TRUE)
alphabetFrequency(x0, baseOnly=TRUE, collapse=TRUE)

reverseComplement(x0)


##############################################################################
###
### Exercise 4
### ==========

library(hgu95av2probe)

### Only one single object is defined in a "probe" package, it has the name
### of the package:
hgu95av2probe
class(hgu95av2probe)
dim(hgu95av2probe)
### The "variables" of a data frame are its columns:
colnames(hgu95av2probe)
### In a data frame, all the variables always have the same length which is
### the number of rows of the data frame. We access a data frame variable with
### the $ operator:
length(hgu95av2probe$sequence)
### See the first 10 elements of the "sequence" variable:
hgu95av2probe$sequence[1:10]
nchar(hgu95av2probe$sequence[1:10])
### Yes they look like 25-mers which is what we expect for an Affymetrix chip.
### Let's store the "sequence" variable into a DNAStringSet object:
x1 <- DNAStringSet(hgu95av2probe$sequence)
x1

### First we get the matrix of nucleotide frequencies:
af1 <- alphabetFrequency(x1, baseOnly=TRUE, freq=TRUE)
colnames(af1)
### Then we get the vector of CG content
GCcontent <- af1[ , "C"] + af1[ , "G"]
### or equivalently:
GCcontent2 <- rowSums(af1[ , c("C", "G")])
### And finally we count (with sum()) the number of elements in GCcontent that
### are >= 0.8:
sum(GCcontent >= 0.8)

### And for the entire microarray:
af1global <- alphabetFrequency(x1, baseOnly=TRUE, collapse=TRUE, freq=TRUE)
sum(af1global[c("C","G")])  # 48%


##############################################################################
###
### Exercise 5
### ==========

subj <- DNAString("taacacgacctga")
v <- views(subj, start=c(4, 7, 1), end=c(9, 10, 2))
v

subject(v)
start(v)
end(v)
gaps(v)

alphabetFrequency(v)
DNAStringSet(v)


##############################################################################
###
### Exercise 6
### ==========

library(BSgenome.Celegans.UCSC.ce2)
Celegans

Celegans$chrI
Celegans$chrII

alphabetFrequency(Celegans$chrI, baseOnly=TRUE)
alphabetFrequency(Celegans$chrII, baseOnly=TRUE)
### yes, chrII contains IUPAC extended letters

library(BSgenome.Dmelanogaster.UCSC.dm3)
Dmelanogaster
alphabetFrequency(Dmelanogaster$chr2L, baseOnly=TRUE)
## yes, chr2L contains IUPAC extended letters

chr2Lhead <- subseq(Dmelanogaster$chr2L, start=1, end=5000000)
slices <- views(chr2Lhead, start=20:5000000 - 19, end=20:5000000)
slices

### Look at the examples in ?chartr for how to simulate a bisulfite
### transformation of a DNA sequence.


##############################################################################
###
### Exercise 7
### ==========

subject <- Dmelanogaster$chr2L

pattern1 <- DNAString("ACCACTGAA")
hits1 <- matchPattern(pattern1, subject)
hits1

pattern2 <- DNAString("ACCACTGAAATG")
hits2 <- matchPattern(pattern2, subject)
hits2

### and to search the minus strand:
rcpattern1 <- reverseComplement(pattern1)
rchits1 <- matchPattern(rcpattern1, subject)
rchits1

rcpattern2 <- reverseComplement(pattern2)
rchits2 <- matchPattern(rcpattern2, subject)
rchits2

### Look at the examples in ?reverseComplement for a discussion about why
### reverse complementing the pattern and not the subject is generally better.


##############################################################################
###
### Exercise 8
### ==========

subject <- Dmelanogaster$chr2L
matchPattern("GAACTTTGCCACTC", subject)

### We can replace the 6th letter by an N (in the IUPAC extended alphabet, N
### is a wildcard that matches any nucleotide, see the IUPAC_CODE_MAP
### predefined vector for the mapping between extended letters and what they
### match):
pattern <- DNAString("GAACTNTGCCACTC")
matchPattern(pattern, subject)

### Also we cannot use 'fixed=TRUE' if we want the N to be interpreted as
### an ambiguity:
matchPattern(pattern, subject, fixed=FALSE)
### In fact, we don't want the Ns in the subject to be interpreted as
### ambiguities, only in the pattern. This is achieved with:
matchPattern(pattern, subject, fixed="subject")


##############################################################################
###
### Exercise 9
### ==========

library(BSgenome.Hsapiens.UCSC.hg18)
chr1 <- Hsapiens$chr1
chr1
maskedratio(chr1)
active(masks(chr1))[1] <- TRUE # activate mask of assembly gaps
chr1
maskedratio(chr1)

alphabetFrequency(chr1)
alphabetFrequency(unmasked(chr1))

active(masks(chr1)) <- TRUE # activate all the masks

Ebox <- "CANNTG"
hits_allmasks <- matchPattern(Ebox, chr1, fixed=FALSE)
hits_allmasks
hits_nomasks <- matchPattern(Ebox, unmasked(chr1), fixed="subject")
hits_nomasks
### So by masking, we keep only E-boxes that are not in repeat regions.


##############################################################################
###
### Exercise 10
### ===========

chr2 <- Hsapiens$chr2
active(masks(chr2))[1] <- TRUE # activate mask of assembly gaps

alphabetFrequency(chr2) # we see 2855 intra-contig Ns!

v2 <- as(chr2 , "XStringViews")
v2 # we don't immediately see the intra-contig Ns but this doesn't mean that
   # they are not here
alphabetFrequency(v2) # yes they are here, in the 6th contig (6th view)

### A mask inversion is needed in order to get the inter-view gaps (or
### inter-contig gaps, or assembly gaps). That's what gaps() does:
gaps(v2)


##############################################################################
###
### Exercise 11
### ===========

library(BSgenome.Dmelanogaster.UCSC.dm3)
chrX <- Dmelanogaster$chrX

### A first attempt:
cpals_FchrX <- findComplementedPalindromes(chrX, min.armlength=30, max.looplength=10)
cpals_FchrX

### The search will be cleaner if we mask the assembly gaps:
active(masks(chrX))[1] <- TRUE
cpals_FchrXcleaner <- findComplementedPalindromes(chrX, min.armlength=30, max.looplength=10)
cpals_FchrXcleaner

### Now with Human chromosome 2:
cpals_Hchr2 <- findComplementedPalindromes(chr2, min.armlength=30, max.looplength=10)
cpals_Hchr2

af <- alphabetFrequency(cpals_Hchr2)
ii <- which(af[ , "A"] != 0 & af[ , "C"] != 0 & af[ , "G"] != 0 & af[ , "T"] != 0)
cpals_Hchr2[ii] # 105 palindromic regions contain the 4 bases


##############################################################################
###
### Exercise 12
### ===========

library(BSgenome.Hsapiens.UCSC.hg18)
library(hgu95av2probe)
dict0 <- DNAStringSet(hgu95av2probe$sequence)

### Preprocess:
pdict0 <- PDict(dict0)

### Find the hits:
mindex <- matchPDict(pdict0, Hsapiens$chr1)

### Extract the number of hits for each probe:
nhits <- countIndex(mindex)

### Probe(s) with the highest number of hits:
max(nhits) # highest number of hits per probe is 11293
dict0[nhits == max(nhits)] # see probe(s) with highest number of hits
as.data.frame(hgu95av2probe[nhits == max(nhits), ]) # get more information
                                                    # about this/these probe(s)

### Note that by using 'which.max(nhits)' instead of 'nhits == max(nhits)' above,
### you could have missed the other probes that have the same highest number of
### hits in case there were more than one. See ?which.max for the details.

which.max(nhits) # probe 201534
probe201534_hits <- mindex[[which.max(nhits)]]
views(Hsapiens$chr1, start=start(probe201534_hits), end=end(probe201534_hits))


##############################################################################
###
### Exercise 13
### ===========

library(BSgenome.Hsapiens.UCSC.hg18)
library(SNPlocs.Hsapiens.dbSNP.20071016)
hg18snp <- injectSNPs(Hsapiens, 'SNPlocs.Hsapiens.dbSNP.20071016')
hg18snp

chr1snps <- hg18snp$chr1
alphabetFrequency(chr1snps) # lots of IUPAC extended letters (1 extended
                            # letter per SNP!)
alphabetFrequency(Hsapiens$chr1) # no IUPAC extended letters (beside the Ns)

### Our first attempt to count the hits for the hgu95av2 probes fails because
### there are "too many IUPAC ambiguity letters in 'subject'":
#nhits <- countPDict(pdict0, chr1snps, fixed="pattern") # first attempt fails!

### We need to mask the assembly gaps for this to work properly:
active(masks(chr1snps))[1] <- TRUE
nhits <- countPDict(pdict0, chr1snps, fixed="pattern")

max(nhits) # highest number of hits per probe is now 11492 instead of 11293
           # (generally speaking, all probes have now more hits)

which.max(nhits) # probe 201534 again
probe201534 <- dict0[[which.max(nhits)]]
probe201534_hits <- matchPattern(probe201534, chr1snps, fixed="pattern")
consmat(probe201534_hits) # a simple confirmation that the hits contain SNPs
ii <- which(probe201534_hits != probe201534)
probe201534_hits[ii] # probe 201534 has 635 hits that contain at least 1
                     # known SNP