---
title: 'GFM: alternate maximization and information criterion'
author: "Wei Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{GFM: alternate maximization and information criterion}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

In this tutorial, we show how the alternate maximization (AM) algorithm is used in the first step of the two-step estimation method, and how the information criterion (IC) is used to choose the number of factors.
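
To build intuition for the AM step, below is a minimal sketch of alternating updates for a purely Gaussian linear factor model, written in base R. It only illustrates the idea of alternating between the factor and loading updates; it is not the algorithm implemented in `gfm()`, which handles general variable types.
```{r  eval=FALSE}
## Illustration only: alternating least-squares updates for X ~ H %*% t(B)
## in the Gaussian case. gfm() generalizes this idea to non-Gaussian types.
n <- 100; p <- 50; q <- 2
H0 <- matrix(rnorm(n * q), n, q)  # true factors
B0 <- matrix(rnorm(p * q), p, q)  # true loadings
X  <- H0 %*% t(B0) + matrix(rnorm(n * p, sd = 0.5), n, p)

H <- matrix(rnorm(n * q), n, q)   # random initialization
for (iter in 1:50) {
  # update loadings given factors (least squares)
  B <- t(solve(crossprod(H), crossprod(H, X)))
  # update factors given loadings (least squares)
  H <- X %*% B %*% solve(crossprod(B))
}
```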



## Fit GFM model using simulated data

The package can be loaded with the command:
```{r  eval=FALSE}
library("GFM")
set.seed(1) # set a random seed for reproducibility.
```
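
If the package is not yet installed, it can first be obtained from CRAN:
```{r  eval=FALSE}
# install the GFM package from CRAN (only needed once)
install.packages("GFM")
```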

### GFM can handle data with homogeneous normal variables
First, we generate the data with homogeneous normal variables.
```{r  eval=FALSE}
## Homogeneous  normal variables
  dat <- gendata(q = 2, n=100, p=100, rho=3)
```
Second, we obtain the observed data and specify the variable types.
```{r eval=FALSE}
# Obtain the observed data
  XList <- dat$XList # the data in the form of a matrix list
  str(XList)
  X <- dat$X # the data in the form of a matrix
# set the variable type; 'gaussian' means continuous variables
  types <- 'gaussian'
```
Third, we fit the GFM model with a user-specified number of factors and evaluate the estimates via canonical correlations between the estimated and true factors/loadings (values close to one indicate accurate recovery).
```{r  eval=FALSE}
# specify q=2
  gfm1 <- gfm(XList, types, algorithm="AM", q=2, verbose = FALSE)
  
  # measure the performance of GFM estimators in terms of canonical correlations
  measurefun(gfm1$hH, dat$H0, type='ccor')
  measurefun(gfm1$hB, dat$B0, type='ccor')
```
The number of factors can also be determined in a data-driven manner.
```{r  eval=FALSE}
# select q automatically
  hq <- chooseFacNumber(XList, types, select_method='IC', q_set = 1:6, verbose = FALSE, parallelList=list(parallel=TRUE))
  hq
```
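
For intuition on how an information criterion selects the number of factors, below is a minimal sketch for the Gaussian case using a Bai-and-Ng-style penalty. The exact criterion implemented in `chooseFacNumber()` may differ, so treat this only as an illustration.
```{r  eval=FALSE}
## Illustration only: IC-style selection of q for a Gaussian factor model.
## For each candidate q, fit a rank-q SVD and compute
##   IC(q) = log(mean residual sum of squares) + q * penalty(n, p).
ic_select <- function(X, q_set) {
  n <- nrow(X); p <- ncol(X)
  penalty <- (n + p) / (n * p) * log(n * p / (n + p))  # a Bai & Ng (2002)-type penalty
  ic <- sapply(q_set, function(q) {
    sv  <- svd(X, nu = q, nv = q)
    fit <- sv$u %*% diag(sv$d[1:q], q, q) %*% t(sv$v)
    log(mean((X - fit)^2)) + q * penalty
  })
  q_set[which.min(ic)]
}
ic_select(dat$X, 1:6)  # compare with the hq selected above
```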


### GFM outperforms LFM in analyzing data with heterogeneous normal variables
First, we generate the data with heterogeneous normal variables and specify the variable types.
```{r  eval=FALSE}
  dat <- gendata(seed=1, n=100, p=100, type='heternorm', q=2, rho=1)
 # Obtain the observed data
  XList <- dat$XList # the data in the form of a matrix list
  str(XList)
  X <- dat$X # the data in the form of a matrix
# set the variable type; 'gaussian' means continuous variables
  types <- 'gaussian'
```

Second, we fit the GFM model with a user-specified number of factors and compare the results with those of the linear factor model (LFM).
```{r  eval=FALSE}
# specify q=2
  gfm1 <- gfm(XList, types,  algorithm="AM", q=2, verbose = FALSE)
  
  # measure the performance of GFM estimators in terms of canonical correlations
  corH_gfm <- measurefun(gfm1$hH, dat$H0, type='ccor')
  corB_gfm <- measurefun(gfm1$hB, dat$B0, type='ccor')
  
  lfm1 <- Factorm(X, q=2)
  corH_lfm <- measurefun(lfm1$hH, dat$H0, type='ccor')
  corB_lfm <- measurefun(lfm1$hB, dat$B0, type='ccor')
  
  library(ggplot2)
  df1 <- data.frame(CCor= c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
                    Method =factor(rep(c('GFM', "LFM"), times=2)),
                    Quantity= factor(c(rep('factors',2), rep("loadings", 2))))
  ggplot(data=df1, aes(x=Quantity, y=CCor, fill=Method)) + geom_bar(position = "dodge", stat="identity",width = 0.5)
```

The number of factors can also be determined in a data-driven manner.
```{r  eval=FALSE}
# select q automatically
  hq <- chooseFacNumber(XList, types, select_method='IC', q_set = 1:6, verbose = FALSE, parallelList=list(parallel=TRUE))
```
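
If the selected `hq` differs from the value used above, the model can simply be refitted with it; a minimal sketch:
```{r  eval=FALSE}
# refit GFM with the data-driven number of factors selected above
gfm_hq <- gfm(XList, types, algorithm = "AM", q = hq, verbose = FALSE)
measurefun(gfm_hq$hH, dat$H0, type = 'ccor')
```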

### GFM outperforms LFM in analyzing data with count (Poisson) variables
First, we generate the data with count (Poisson) variables and specify the variable types.
```{r  eval=FALSE}
  q <- 3; p <- 200
  dat <- gendata(seed=1, n=200, p=p, type='pois', q=q, rho=4)
  # Obtain the observed data
  XList <- dat$XList # the data in the form of a matrix list
  str(XList)
  X <- dat$X # the data in the form of a matrix
  # set the variable type; 'poisson' means count variables
  types <- 'poisson'
```

Second, we fit the GFM model given the true number of factors and record the computation time.
```{r  eval=FALSE}
  system.time(
   gfm1 <- gfm(XList, types,  algorithm="AM", q=3, verbose = FALSE)
  )
```

The number of factors can also be selected automatically; here we also record the computation time of the IC-based selection.
```{r  eval=FALSE}
system.time(
  hq <- chooseFacNumber(XList, types, q_set=1:6, select_method = "IC", parallelList=list(parallel=TRUE))
)

``` 


Third, we compare the results with those of the linear factor model.
```{r  eval=FALSE}

  # measure the performance of GFM estimators in terms of canonical correlations
  corH_gfm <- measurefun(gfm1$hH, dat$H0, type='ccor')
  corB_gfm <- measurefun(gfm1$hB, dat$B0, type='ccor')
  
  lfm1 <- Factorm(X, q=3)
  corH_lfm <- measurefun(lfm1$hH, dat$H0, type='ccor')
  corB_lfm <- measurefun(lfm1$hB, dat$B0, type='ccor')
  
  library(ggplot2)
  df1 <- data.frame(CCor= c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
                    Method =factor(rep(c('GFM', "LFM"), times=2)),
                    Quantity= factor(c(rep('factors',2), rep("loadings", 2))))
  ggplot(data=df1, aes(x=Quantity, y=CCor, fill=Method)) + geom_bar(position = "dodge", stat="identity",width = 0.5)
```


### GFM outperforms LFM in analyzing data with mixed types of count and categorical variables

First, we generate the data with mixed count (Poisson) and categorical (binomial) variables, then fit the GFM model with a user-specified number of factors.
```{r  eval=FALSE}
  dat <- gendata(seed=1, n=200, p=200, type='pois_bino', q=2, rho=2)
  # Obtain the observed data
  XList <- dat$XList # the data in the form of a matrix list
  str(XList)
  X <- dat$X # the data in the form of a matrix
  # the variable types are taken from the generated data (Poisson and binomial)
  types <- dat$types
  table(dat$X[, 1])   # inspect the first variable
  table(dat$X[, 200]) # inspect the last variable
  # user-specified q=2
  gfm2 <- gfm(XList, types,  algorithm="AM", q=2, verbose = FALSE)
  measurefun(gfm2$hH, dat$H0, type='ccor')
  measurefun(gfm2$hB, dat$B0, type='ccor')
```
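
For mixed-type data, `XList` contains one matrix per variable type, in the same order as `types` (as the `str(XList)` output above suggests). One quick way to check this alignment is:
```{r  eval=FALSE}
# each entry of types corresponds to one matrix in XList;
# check the type labels together with the dimensions of each matrix
types
sapply(XList, dim)
```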


Second, we select the number of factors automatically and measure the performance of the GFM estimators in terms of canonical correlations.
```{r  eval=FALSE}
  #  select q automatically
  hq <- chooseFacNumber(XList, types, select_method='IC', q_set = 1:4, verbose = FALSE, parallelList=list(parallel=TRUE))
  # measure the performance of GFM estimators in terms of canonical correlations
  corH_gfm <- measurefun(gfm2$hH, dat$H0, type='ccor')
  corB_gfm <- measurefun(gfm2$hB, dat$B0, type='ccor')
  
```
Third, we compare the results with those of the linear factor model.
```{r  eval=FALSE}
  lfm1 <- Factorm(dat$X, q=2)
  corH_lfm <- measurefun(lfm1$hH, dat$H0, type='ccor')
  corB_lfm <- measurefun(lfm1$hB, dat$B0, type='ccor')
  
  library(ggplot2)
  df1 <- data.frame(CCor= c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
                    Method =factor(rep(c('GFM', "LFM"), times=2)),
                    Quantity= factor(c(rep('factors',2), rep("loadings", 2))))
  ggplot(data=df1, aes(x=Quantity, y=CCor, fill=Method)) + geom_bar(position = "dodge", stat="identity",width = 0.5)
```

## Session information
```{r}
sessionInfo()
```