\name{PRIM.example.data}
\alias{PRIM.example.data}
\alias{PRIM.example}
\alias{X.PRIM}
\alias{Y.PRIM}
\alias{true.Y.status.PRIM}
\alias{X.PRIM4D}
\alias{Y.PRIM4D}
\alias{true.Y.status.PRIM4D}
\docType{data}
\title{PRIM example: Simulated clustered data}
\description{

  \bold{PRIM-2Dimensional Dataset:}
  
  The simulated data comprise three
  actual normally distributed clusters, denoted by 1, 2, and 3.  These
  true category statuses are stored in the
  vector 'true.Y.status.PRIM' (which additionally uses the values 0 and 4
  to mark noise observations).  In reality, the truth is unknown, and
  thus, the information about the true category status is hidden.  The
  rules obtained from PRIM will be compared to the true cluster categories, from
  which the data were generated.
  
  The stochastic structure of the data is a mixture model of normals.
  In our test dataset there were 1000 observations.
  
  Each of the 1000 'Y.PRIM' responses comes from an independent
  Bernoulli distribution with a probability of 0.5.
  
  The 'X.PRIM' matrix, with 2 column variables and 1000
  row observations, was made up of the following distributions
  conditional on the response 'Y.PRIM':
  
 
  \item{[True Cluster 1]}{1/4 of the observations when the response Y.PRIM=1,
    came from the following Multivariate Normal distribution.
    
    X.PRIM | Y.PRIM=1 ~ N([500,500],[[625, 0],[0, 625]])   
  }
  \item{[True Cluster 2]}{1/4 of the observations when the response Y.PRIM=1
    came from the following Multivariate Normal distribution.
    
    X.PRIM | Y.PRIM=1 ~ N([400,100],[[625, 0],[0, 625]])
  }
  \item{[True Cluster 3]}{1/4 of the observations when the response Y.PRIM=1
    came from the following Multivariate Normal distribution.
    
    X.PRIM | Y.PRIM=1 ~ N([200,900],[[625, 0],[0, 625]])
  }

  \item{[Noise]}{Each of the two column variables for 1/4 of the 
  observations when the response Y.PRIM=1 and all observations when Y.PRIM=0 came from
  the following uniform distribution with a range of integer 
  values between 0 and 1023.
  
  
  X1|Y.PRIM=1,Y.PRIM=0 ~ U(0,1023)
 
  X2|Y.PRIM=1,Y.PRIM=0 ~ U(0,1023)

  }
 

  \bold{PRIM-4Dimensional Dataset:}
 
  The simulated data comprise three
  actual clusters, denoted by 1, 2, and 3, that are normally
  distributed.  These values for true category status are stored in the
  vector true.Y.status.PRIM4D (which additionally uses the values 0 and 4
  to mark noise observations).  In reality, the truth is unknown, and
  thus, the information about the true category status is hidden.  The
  rules obtained will be compared to the true cluster categories, from
  which the data were generated.
  
  The stochastic structure of the data is a mixture model of normals.
  In our test dataset there were 2000 observations.
  
  Each of the 2000 Y.PRIM4D responses comes from an independent
  Bernoulli distribution with a probability of 0.5.
  
  The X.PRIM4D matrix, with 4 column variables and 2000
  row observations, was made up of the following distributions
  conditional on the response Y.PRIM4D:
  
  \item{[True Cluster 1]}{1/4 of the observations when the response Y.PRIM4D=1,
    came from the following Multivariate Normal distribution.
    
    X.PRIM4D | Y.PRIM4D=1 ~ N([500,500,500,500],
    [[625, 0,0,0],[0,625,0,0], [0,0,625,0],[0,0,0,625]])   
  }
  
  \item{[True Cluster 2]}{1/4 of the observations when the response Y.PRIM4D=1
    came from the following Multivariate Normal distribution.
    
    X.PRIM4D | Y.PRIM4D=1 ~ N([200,1000,200,800],
    [[700, 0,0,0],[0,200,0,0], [0,0,100,0],[0,0,0,700]])   
  }
  
  \item{[True Cluster 3]}{1/4 of the observations when the response Y.PRIM4D=1
    came from the following Multivariate Normal distribution.
    
    X.PRIM4D | Y.PRIM4D=1 ~ N([1000,100,900,100],
    [[400, 0,0,0],[0,500,0,0], [0,0,600,0],[0,0,0,300]])
  }

  \item{[Noise]}{Each of the four column variables for 1/4 of the 
  observations when the response Y.PRIM4D=1 and all observations when Y.PRIM4D=0 came from
  the following uniform distribution with a range of integer 
  values between 0 and 1023.  Let Xi denote the i-th column of X.PRIM4D.
  
  
  X1|Y.PRIM4D=1,Y.PRIM4D=0 ~ U(0,1023)
 
  X2|Y.PRIM4D=1,Y.PRIM4D=0 ~ U(0,1023)

  X3|Y.PRIM4D=1,Y.PRIM4D=0 ~ U(0,1023)
 
  X4|Y.PRIM4D=1,Y.PRIM4D=0 ~ U(0,1023)
  }

}

\usage{

data(PRIM.example.data)


}
\format{
  \item{X.PRIM}{
    A matrix of 2 columns named "X1" and "X2" with 1000 row observations.
  }
  \item{Y.PRIM}{
    A vector denoting the observed response corresponding to each row observation
    of 'X.PRIM' and having a length of 1000
  }
  \item{true.Y.status.PRIM}{
    A vector denoting the true cluster category corresponding to each row observation
    of 'X.PRIM' and having a length of 1000
  }
  
  \item{X.PRIM4D}{
    A matrix of 4 columns named "X1", "X2", "X3", and "X4" with 2000 row
    observations.
  }
  \item{Y.PRIM4D}{
    A vector denoting the observed response corresponding to each row observation
    of 'X.PRIM4D' and having a length of 2000
  }
  \item{true.Y.status.PRIM4D}{
    A vector denoting the true cluster category corresponding to each row observation
    of 'X.PRIM4D' and having a length of 2000
  }
 
}

\source{

 A simulated dataset generated with the R code shown in the "Examples" section below.
  
}
\references{
  See also \pkg{rfcprim} for examples
  using this simulated data.
}
\examples{
data(PRIM.example.data)

 ## The following is the source R code
 ## setting the seed will generate the same dataset
if (FALSE){
  if (require(MASS)){
    ## ---- 2D dataset: builds Y.PRIM, X.PRIM and true.Y.status.PRIM ----
    ## MASS is needed for mvrnorm.  The seed is fixed here, so the order
    ## of the random draws below must not be changed if the same dataset
    ## is to be reproduced.
    set.seed(20)
    ## Y.PRIM: 1000 binary responses, each 1 with probability 0.5
    Y.PRIM <- rbinom(1000, 1, 0.5)

    ## X.PRIM matrix, rows corresponding to Y.PRIM
    ##           columns are the variables

    ## When Y.PRIM==1 the rows of X.PRIM come from a mixture of three
    ## bivariate normal clusters plus one uniform noise component.

    ## Element-wise square of a diagonal matrix with 25 on the diagonal:
    ## the covariance matrix has 625 on the diagonal and 0 elsewhere.
    Sigma <- matrix(c(25,0,0,25),2,2)^2

    ## Each component draws ceiling(n1/4) rows, where n1 is the number
    ## of responses equal to 1.
    ## Component centred at (500, 500)
    X1.y1 <- mvrnorm(n=ceiling(length(Y.PRIM[Y.PRIM==1])/4),
               c(500, 500), Sigma, empirical = FALSE)

    ## Component centred at (400, 100)
    X2.y1 <- mvrnorm(n=ceiling(length(Y.PRIM[Y.PRIM==1])/4),
               c(400, 100), Sigma, empirical = FALSE)
 
    ## Component centred at (200, 900)
    X3.y1 <- mvrnorm(n=ceiling(length(Y.PRIM[Y.PRIM==1])/4),
              c(200, 900), Sigma, empirical = FALSE)

    ## Noise component: both columns drawn independently from U(0, 1023)
    X4.1.y1 <-  runif(ceiling(length(Y.PRIM[Y.PRIM==1])/4), 0, 1023)
 
    X4.2.y1 <-  runif(ceiling(length(Y.PRIM[Y.PRIM==1])/4), 0, 1023)
    X4.y1 <- cbind(X4.1.y1, X4.2.y1)

    ## Stack the four components.  Because of the ceiling() above the
    ## stack can hold a few more than n1 rows, so it is cut back to
    ## exactly n1 rows; any surplus is dropped from the end, i.e. from
    ## the noise component.
    X.y1 <- rbind(X1.y1, X2.y1, X3.y1, X4.y1)[1:length(Y.PRIM[Y.PRIM==1]),]


    ## When Y.PRIM==0 both columns are uniform noise on (0, 1023)
    X1.y0 <- runif(length(Y.PRIM[Y.PRIM==0]), 0, 1023)
    X2.y0 <- runif(length(Y.PRIM[Y.PRIM==0]), 0, 1023)
    X.y0 <- cbind(X1.y0, X2.y0)

    ## true Y.PRIM cluster status:
    ## 0 is noise with Y.PRIM==0, 4 is noise with Y.PRIM==1
    ## 1, 2, 3, are the true clusters
    ## NOTE(review): the component centred at (500, 500) receives label 2
    ## and the one centred at (400, 100) receives label 1, whereas the
    ## description section of this help page numbers them the other way
    ## round -- confirm which numbering is intended.

    ## Labels are built in the same stacking order as X.y1 and trimmed
    ## to the same length n1.
    true.Y.status.PRIM <- c(rep(2, dim(X1.y1)[1]),
    rep(1, dim(X2.y1)[1]), rep(3, dim(X3.y1)[1]),
    rep(4, dim(X4.y1)[1]))[1:length(Y.PRIM[Y.PRIM==1])]

    ## Append label 0 for every row generated under Y.PRIM==0
    true.Y.status.PRIM <- c(true.Y.status.PRIM, rep(0,dim(X.y0)[1]))

    ## Sort Y.PRIM so that all the 1s come first, which matches the row
    ## order of X.PRIM (rows for Y.PRIM==1 stacked above rows for 0).
    Y.PRIM<- sort(Y.PRIM, decreasing=TRUE)
    X.PRIM <- rbind(X.y1, X.y0)
    colnames(X.PRIM) <- c("X1", "X2") 
  }
   ## 4D Dataset

  if (require(MASS)==TRUE){
    ## ---- 4D dataset: builds Y.PRIM4D, X.PRIM4D and true.Y.status.PRIM4D ----
    ## No seed is set in this block, so the values drawn here depend on
    ## the state of the random number generator when the block is reached.
    ## Y.PRIM4D: 2000 binary responses, each 1 with probability 0.5
    Y.PRIM4D <- rbinom(2000, 1, 0.5)
    ## When Y.PRIM4D==1 the rows of X come from a mixture of three
    ## 4-variate normal clusters plus one uniform noise component.
    ## Diagonal covariance matrices, one per cluster:
    Sigma1 <- rbind(c(625,0,0,0),
                    c(0, 625, 0, 0),
                    c(0, 0, 625, 0),
                    c(0, 0, 0, 625))
    Sigma2 <- rbind(c(700, 0, 0, 0),
                    c(0, 200, 0, 0),
                    c(0, 0, 100,0),
                    c(0, 0, 0, 700))
    Sigma3 <- rbind(c(400,0,0,0),
                    c(0,500,0,0),
                    c(0,0,600,0),
                    c(0, 0,0,300))

    ## Each component draws ceiling(n1/4) rows, where n1 is the number
    ## of responses equal to 1.
    X1.y1 <- mvrnorm(n=ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4),
                   c(500, 500, 500, 500), Sigma1, empirical = FALSE)
    X2.y1 <- mvrnorm(n=ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4),
                   c(200, 1000, 200, 800), Sigma2, empirical = FALSE)
    X3.y1 <- mvrnorm(n=ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4),
                   c(1000, 100, 900, 100), Sigma3, empirical = FALSE)

    ## Noise component: all four columns drawn independently from U(0, 1023)
    X4.1.y1 <-  runif(ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4), 0, 1023)
    X4.2.y1 <-  runif(ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4), 0, 1023)
    X4.3.y1 <-  runif(ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4), 0, 1023)
    X4.4.y1 <-  runif(ceiling(length(Y.PRIM4D[Y.PRIM4D==1])/4), 0, 1023)

    X4.y1 <- cbind(X4.1.y1, X4.2.y1, X4.3.y1, X4.4.y1)

    ## Stack the four components and cut back to exactly n1 rows; any
    ## surplus created by ceiling() is dropped from the end, i.e. from
    ## the noise component.
    X.y1 <- rbind(X1.y1, X2.y1, X3.y1, X4.y1)[1:length(Y.PRIM4D[Y.PRIM4D==1]),]
    ## When Y.PRIM4D==0 all four columns are uniform noise on (0, 1023)
    X1.y0 <- runif(length(Y.PRIM4D[Y.PRIM4D==0]), 0, 1023)
    X2.y0 <- runif(length(Y.PRIM4D[Y.PRIM4D==0]), 0, 1023)
    X3.y0 <- runif(length(Y.PRIM4D[Y.PRIM4D==0]), 0, 1023)
    X4.y0 <- runif(length(Y.PRIM4D[Y.PRIM4D==0]), 0, 1023)
    X.y0 <- cbind(X1.y0, X2.y0, X3.y0, X4.y0)
    ## Numeric status label per row (not a logical flag), built in the
    ## same stacking order as X.y1 and trimmed to the same length n1.
    ## NOTE(review): the component centred at (500, 500, 500, 500)
    ## receives label 2 and the one centred at (200, 1000, 200, 800)
    ## receives label 1, whereas the description section of this help
    ## page numbers them the other way round -- confirm which is intended.
    true.Y.status.PRIM4D <- c(rep(2, dim(X1.y1)[1]), rep(1, dim(X2.y1)[1]), rep(3, dim(X3.y1)[1]),  
                       rep(4, dim(X4.y1)[1]))[1:length(Y.PRIM4D[Y.PRIM4D==1])]
    ## NOTE: true.Y.status.PRIM4D=0,4 is uniform, random noise
    ## (0 when Y.PRIM4D==0, 4 when Y.PRIM4D==1); true.Y.status.PRIM4D=1,2,3
    ## denotes the real cluster category
    true.Y.status.PRIM4D <- c(true.Y.status.PRIM4D, rep(0,dim(X.y0)[1]))
    ## Sort the responses so that all the 1s come first, matching the
    ## row order of X.PRIM4D (rows for 1 stacked above rows for 0).
    Y.PRIM4D <- sort(Y.PRIM4D, decreasing=TRUE)
    X.PRIM4D <- rbind(X.y1, X.y0)
    colnames(X.PRIM4D) <- c("X1", "X2", "X3", "X4")
    }
  }
}
\keyword{datasets}