## ----echo=FALSE, results="hide", message=FALSE--------------------------------
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
library(BiocStyle)
set.seed(42)

## -----------------------------------------------------------------------------
suppressMessages(library(immApex))
suppressMessages(library(keras3))
suppressMessages(library(ggplot2))
suppressMessages(library(viridis))
suppressMessages(library(magrittr))

## ----tidy = FALSE-------------------------------------------------------------
sequences <- generateSequences(prefix.motif = "CAS",
                               suffix.motif = "YF",
                               number.of.sequences = 1000,
                               min.length = 8,
                               max.length = 16)
head(sequences)

## ----tidy = FALSE-------------------------------------------------------------
nucleotide.sequences <- generateSequences(number.of.sequences = 1000,
                                          min.length = 8,
                                          max.length = 16,
                                          sequence.dictionary = c("A", "C", "T", "G"))
head(nucleotide.sequences)

## ----tidy = FALSE, eval = reticulate::py_module_available("keras")------------
# variational.sequences <- variationalSequences(sequences,
#                                               encoder = "onehotEncoder",
#                                               number.of.sequences = 100,
#                                               encoder.hidden.dim = c(256, 128),
#                                               latent.dim = 16,
#                                               batch.size = 16,
#                                               call.threshold = 0.1)
# head(variational.sequences)

## ----tidy = FALSE-------------------------------------------------------------
mutated.sequences <- mutateSequences(sequences,
                                     n.sequence = 1,
                                     position.start = 3,
                                     position.end = 8)
head(sequences)
head(mutated.sequences)

## ----tidy = FALSE-------------------------------------------------------------
data("immapex_example.data")
Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]],
                                region = "v",
                                technology = "Adaptive",
                                simplify.format = TRUE)
head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")])

## ----tidy = FALSE-------------------------------------------------------------
TRBV_aa <- getIMGT(species = "human",
                   chain = "TRB",
                   frame = "inframe",
                   region = "v",
                   sequence.type = "aa")
TRBV_aa[[1]][1]

## ----tidy = FALSE-------------------------------------------------------------
Adaptive_example <- inferCDR(Adaptive_example,
                             chain = "TRB",
                             reference = TRBV_aa,
                             technology = "Adaptive",
                             sequence.type = "aa",
                             sequences = c("CDR1", "CDR2"))
Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")]

## ----tidy = FALSE-------------------------------------------------------------
sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences),
                                 convert.to.matrix = TRUE)
head(sequence.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
                                   method.to.use = "FASGAI",
                                   convert.to.matrix = TRUE)
head(property.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
multi.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
                                         method.to.use = c("atchleyFactors", "kideraFactors"),
                                         convert.to.matrix = TRUE)
head(multi.property.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
median.property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
                                          method.to.use = "crucianiProperties",
                                          summary.function = "median")
head(median.property.matrix[,1:3])

## ----tidy = FALSE-------------------------------------------------------------
geometric.matrix <- geometricEncoder(sequences,
                                     method.to.use = "BLOSUM62",
                                     theta = pi/3)
head(geometric.matrix)

## ----tidy = FALSE-------------------------------------------------------------
token.matrix <- tokenizeSequences(input.sequences = c(sequences, mutated.sequences),
                                  add.startstop = TRUE,
                                  start.token = "!",
                                  stop.token = "^",
                                  convert.to.matrix = TRUE)
head(token.matrix[,1:18])

## ----tidy = FALSE-------------------------------------------------------------
ppm.matrix <- probabilityMatrix(sequences)
head(ppm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
set.seed(42)
back.freq <- sample(1:1000, 20)
back.freq <- back.freq/sum(back.freq)

pwm.matrix <- probabilityMatrix(sequences,
                                max.length = 20,
                                convert.PWM = TRUE,
                                background.frequencies = back.freq)
head(pwm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
adj.matrix <- adjacencyMatrix(sequences,
                              normalize = FALSE)
adj.matrix

## -----------------------------------------------------------------------------
property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences),
                                   method.to.use = "FASGAI",
                                   convert.to.matrix = TRUE)

property.sequences <- sequenceDecoder(property.matrix,
                                      encoder = "propertyEncoder",
                                      aa.method.to.use = "FASGAI",
                                      call.threshold = 1)
head(sequences)
head(property.sequences)

## -----------------------------------------------------------------------------
sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences),
                                 convert.to.matrix = TRUE)

OHE.sequences <- sequenceDecoder(sequence.matrix,
                                 encoder = "onehotEncoder")
head(OHE.sequences)
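## -----------------------------------------------------------------------------
# A minimal round-trip fidelity check, assuming sequenceDecoder() returns
# sequences in the same order as the encoder input: the lossless one-hot
# round trip should recover every sequence exactly, while the thresholded
# property round trip may not.
mean(OHE.sequences == c(sequences, mutated.sequences))
mean(property.sequences == c(sequences, mutated.sequences))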
## ----tidy = FALSE, eval = reticulate::py_module_available("keras")------------
# # Sampling to make Training/Validation Data Cohorts
# set.seed(42)
# num_sequences <- nrow(sequence.matrix)
# indices <- 1:num_sequences
# train_indices <- sample(indices, size = floor(0.8 * num_sequences))
# val_indices <- setdiff(indices, train_indices)
# 
# x_train <- sequence.matrix[train_indices,]
# x_val <- sequence.matrix[val_indices,]
# 
# # Parameters
# input_shape <- dim(x_train)[2]
# epochs <- 20
# batch_size <- 128
# encoding_dim <- 40
# hidden_dim1 <- 256 # Hidden layer 1 size
# hidden_dim2 <- 128 # Hidden layer 2 size
# 
# es <- callback_early_stopping(monitor = "val_loss",
#                               min_delta = 0,
#                               patience = 4,
#                               verbose = 1,
#                               mode = "min")
# 
# # Define the Model
# input_seq <- layer_input(shape = c(input_shape))
# 
# # Encoder Layers
# encoded <- input_seq %>%
#   layer_dense(units = hidden_dim1, name = "e.1") %>%
#   layer_batch_normalization(name = "bn.1") %>%
#   layer_activation('leaky_relu', name = "act.1") %>%
#   layer_dense(units = hidden_dim2, name = "e.2") %>%
#   layer_batch_normalization(name = "bn.2") %>%
#   layer_activation('leaky_relu', name = "act.2") %>%
#   layer_dense(units = encoding_dim, activation = 'selu', name = "latent")
# 
# # Decoder Layers
# decoded <- encoded %>%
#   layer_dense(units = hidden_dim2, name = "d.2") %>%
#   layer_batch_normalization(name = "bn.3") %>%
#   layer_activation('leaky_relu', name = "act.3") %>%
#   layer_dense(units = hidden_dim1, name = "d.1") %>%
#   layer_batch_normalization(name = "bn.4") %>%
#   layer_activation('leaky_relu', name = "act.4") %>%
#   layer_dense(units = input_shape, activation = 'sigmoid')
# 
# # Autoencoder Model
# autoencoder <- keras_model(input_seq, decoded)
# autoencoder %>% keras3::compile(optimizer = optimizer_adam(learning_rate = 0.0001),
#                                 loss = "mse")
# 
# # Train the model
# history <- autoencoder %>% fit(x = x_train,
#                                y = x_train,
#                                validation_data = list(x_val, x_val),
#                                epochs = epochs,
#                                batch_size = batch_size,
#                                shuffle = TRUE,
#                                callbacks = es,
#                                verbose = 0)
# 
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()

## ----tidy = FALSE-------------------------------------------------------------
class1.sequences <- generateSequences(prefix.motif = "CAS",
                                      suffix.motif = "YF",
                                      number.of.sequences = 10000,
                                      min.length = 8,
                                      max.length = 16)

class2.sequences <- generateSequences(prefix.motif = "CASF",
                                      suffix.motif = "YF",
                                      number.of.sequences = 10000,
                                      min.length = 8,
                                      max.length = 16)

labels <- as.numeric(c(rep(0, 10000), rep(1, 10000)))

classifier.matrix <- onehotEncoder(input.sequences = c(class1.sequences, class2.sequences),
                                   convert.to.matrix = TRUE)

## ----tidy = FALSE, eval = reticulate::py_module_available("keras")------------
# # Input shape will be 1D as we are using a matrix
# input.shape <- dim(classifier.matrix)[2]
# 
# # Simple model structure
# classifier.model <- keras_model_sequential() %>%
#   layer_dense(units = 128, activation = "relu",
#               input_shape = c(input.shape)) %>%
#   layer_dense(units = 32, activation = "relu") %>%
#   layer_dense(units = 1, activation = "sigmoid")
# 
# classifier.model %>% compile(
#   optimizer = optimizer_adam(learning_rate = 0.00001),
#   loss = "binary_crossentropy",
#   metrics = c("accuracy")
# )
# 
# # Separating data and labels
# set.seed(42)
# val_indices <- sample(nrow(classifier.matrix), 10000*0.2)
# x_val <- classifier.matrix[val_indices,]
# x_train <- classifier.matrix[-val_indices,]
# 
# val_labels <- labels[val_indices]
# train_labels <- labels[-val_indices]
# 
# # Training the classifier.model
# history <- classifier.model %>% fit(x_train,
#                                     train_labels,
#                                     epochs = 20,
#                                     batch_size = 32,
#                                     validation_data = list(x_val, val_labels),
#                                     verbose = 0)
# 
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()
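## ----tidy = FALSE, eval = reticulate::py_module_available("keras")------------
# # A minimal evaluation sketch, assuming classifier.model was trained above:
# # predicted probabilities on the held-out sequences are thresholded at 0.5
# # (an illustrative cutoff, not a tuned one) and tabulated against the true
# # labels to give a simple confusion matrix.
# val_probabilities <- classifier.model %>% predict(x_val)
# val_predictions <- as.numeric(val_probabilities > 0.5)
# table(predicted = val_predictions, actual = val_labels)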
## -----------------------------------------------------------------------------
sessionInfo()