This package implements the genome scan and genome-wide association studies using deep neural networks (i.e, Multi-Layer Perceptron (MLP), Convolutional Neural Network (CNN)). DeepGenomeScan offers heuristic computational framework integrating different neural network architectures (i.e.,Multi-Layer Perceptron (MLP), convolutional neural network(CNN)) and robust resampling methods, as well as the Model-Agnostic interpretation of feature importance for convolutional neural networks. DeepGenomeScan, in other words, deep learning for genome-wide scanning, is a deep learning-based approach for detecting signatures of natural selection or for performing omics-based genome-wide association studies, such as GWAS, PWAS, TWAS, MWAS. The framework makes the implementation user-friendly. It is compatible with most of the self-defined machine learning models (the self-defined models should be complete, including tunable parameters, fitted model, predicted model, examples can be found in our tutorial). Users can adopt this framework to study various evolutionary questions.
library("devtools")
## The modified version of caret
devtools::install_github("xinghuq/CaretPlus/pkg/caret")
###DA and KLFDAPC are used for SpGenomeScan
if (!requireNamespace("DA", quietly=TRUE))
devtools::install_github("xinghuq/DA")
requireNamespace("KLFDAPC")
if (!requireNamespace("KLFDAPC", quietly=TRUE))
devtools::install_github("xinghuq/KLFDAPC")
if (!requireNamespace("keras", quietly=TRUE))
install.packages("keras")
if (!requireNamespace("tensorflow", quietly=TRUE))
install.packages("tensorflow")
if (!requireNamespace("kerasR", quietly=TRUE))
install.packages("kerasR")
### The latest version of DeepGenomeScan
devtools::install_github("xinghuq/DeepGenomeScan")
library("rappdirs")
library("reticulate")
reticulate::use_python("/usr/bin/python3")
library(caret) ### for ML calling functions and performance estimation, users should use the modified version at xinghuq/CaretPlus/caret instead of the original version
library(keras)
library("tensorflow")
# checking if Tensorflow works properly
K0=keras::backend()
library("DeepGenomeScan")
library("caret")### for ML calling functions and performance estimation
library("keras") ### for DL
library("tensorflow")
library("caretEnsemble")
library("kerasR")
library("RSNNS")
library("NeuralNetTools")
f <- system.file('extdata',package='DeepGenomeScan')
infile <- file.path(f, "sim1.csv")
sim_example=read.csv(infile)
genotype=sim_example[,-c(1:14)]
env=sim_example[,2:11]
str(sim_example)
econtrol1 <- trainControl(## 5-fold CV, repeat 5 times
method = "adaptive_cv",
number = 5,
## repeated ten times
repeats = 5,search = "random")
set.seed(999)
options(warn=-1
GSmlp<- DeepGenomeScan(as.matrix(genotype_norm),env$envir1,
method="mlpSGD",
metric = "RMSE",## "Accuracy", "RMSE","Rsquared","MAE"
tuneLength = 10, ### random search 10 hyperparameter combinations
# tuneGrid=CNNGrid, or setting the tune grid
trControl = econtrol1)
#### Note the genotype_norm is a n*p genotype matrix
#### varIMP for SNPs
out=varImp(GSmlp,scale = FALSE)
#### Calculating p-values and plot Manhanttan plot, remember to scan all environmnet factors and use DLqvaluearsine function to convert the multi-effect importance values to q-values
### here is only plot importance of an example of using one environment factor
DLqvaluesarsine<-function(DL_data,K)
{
loadings<-DL_data# [,1:as.numeric(K)]
normdat <- apply(loadings, 2, normalize)
asindat=apply(normdat,2, function(x) {asin(sqrt(x))})
resmaha <- covRob(asindat, distance = TRUE, na.action= na.omit, estim="donostah")$dist
lambda <- median(resmaha)/qchisq(0.5,df=K)
reschi2test <- pchisq(resmaha/lambda,K,lower.tail=FALSE)
qval <- qvalue(reschi2test)
q.values_DL<-qval$qvalues
padj <- p.adjust(reschi2test,method="bonferroni")
return(data.frame(p.values=reschi2test, q.values=q.values_DL,padj=padj,mahaD=resmaha))
}
DLsim1=apply(out,2,normalize) #18
Simqvaluear=DLqvaluesarsine(DLsim1,1)
## Manhattan plot
ggplot() +
geom_point(aes(x=which(Loci=="Neutral"), y=-log10(Simqvaluear[-which(Loci!="Neutral"),1])), col = "gray83") +
geom_point(aes(x=which(Loci!="Neutral"), y=-log10(Simqvaluear[-which(Loci=="Neutral"),1]), colour = Selected_Loci)) +
xlab("SNPs") + ylab("DNN -log10(p-value)") +ylim(c(0,100))+theme_bw()
plot(out$Overall, ylab="SNP importance")
Welcome any feedback and pull request.