2008/10/29

Exercise of HSAUR Chapter 1

The book: A Handbook of Statistical Analyses Using R
data("Forbes2000", packages = "HSAUR")
##ex 1.1
> tmp <- subset(Forbes2000, country==c("United States","United Kingdom","France","Germany"))
> by(tmp$profits, as.character(tmp$country), FUN=function(x)median(x, na.rm=T))
INDICES: France
[1] 0.215
------------------------------------------------------------------------------
INDICES: Germany
[1] 0.245
------------------------------------------------------------------------------
INDICES: United Kingdom
[1] 0.17
------------------------------------------------------------------------------
INDICES: United States
[1] 0.26
##ex 1.2
> subset(Forbes2000, country=="Germany" & profits < 0, select = "name")
name
350 Allianz Worldwide
364 Deutsche Telekom
397 E.ON
431 HVB-HypoVereinsbank
500 Commerzbank
798 Infineon Technologies
869 BHW Holding
926 Bankgesellschaft Berlin
1034 W&W-Wustenrot
1187 mg technologies
1477 Nurnberger Beteiligungs
1887 SPAR Handels
1994 Mobilcom
##ex 1.3
> summary(subset(Forbes2000, country=='Bermuda', select = "category"))
category
Insurance :10
Conglomerates : 2
Oil & gas operations: 2
Banking : 1
Capital goods : 1
Food drink & tobacco: 1
(Other) : 3
==> Insurance
##ex 1.4
> o <- order(Forbes2000$profits, decreasing = T)
> tmp <- Forbes2000[o,]
> tmp1 <- tmp[1:50,]
> plot(tmp1$sales, log(tmp1$assets), main = "Forbes2000 Top 50 Profits Company's Sales vs Assets", xlab = "Sales", ylab = "log(Assets)")
> text(tmp1$sales, log(tmp1$assets), abbreviate(tmp1$country))
##ex 1.5
> by(Forbes2000$sales, as.character(Forbes2000$country), FUN = function(x) mean(x, na.rm = T))
> tmp <- Forbes2000[Forbes2000$profits>5, c("name","country")]
> summary(tmp$country)

rle and inverse.rle

rle return a list of run length of the vector, length for the run length, value for it's corresponding value
inverse.rle rollback the original vector

example:
> x <- c(1,0,0,1,1,1,1,1,0,0,0) > (tmp <- rle(x)) Run Length Encoding lengths: int [1:4] 1 2 5 3 values : num [1:4] 1 0 1 0

>inverse.rle(tmp)
[1] 1 0 0 1 1 1 1 1 0 0 0


Application: Finding peaks in vector
getPeaks <- function(x) { tmp <- rle(diff(x)>0) change.index <- cumsum(c(1, tmp$lengths)) ind <- change.index[which(tmp$values ==FALSE)] return(ind) }

> x<-c(rnorm(300),rnorm(700,5,2)) > d1<- density(x) > ind <- getPeaks(d1$y) > plot(d1,xlab="",ylab="") > ind [1] 149 279 > points(d1$x[ind], d1$y[ind], pch = 16, col="red")

2008/07/23

Hough Transformation

Ideal sample
library(PET)
testLine <- matrix(0, ncol=100, nrow=100)

diag(testLine) <- 1

ph <- hough(testLine)
viewData(list(testLine, ph$hData), list("line","houghTrans"))

Wafer Map:
library(rimage)
library(pixmap)

tmp <- read.jpeg("xxx.jpg")
tmp1 <- pixmapRGB(c(tmp[, 1:400, 1], tmp[, 1:400, 2], tmp[, 1:400, 3]), 400, 400)
tmp2 <- as(tmp1, "pixmapGrey")
tmp22 <- 1- tmp2@grey

hh <- hough(tmp22)
viewData(list(tmp2@grey, tmp22, hh$hData), list("W1", "W1 inverse","Hough Transformation"))

tmp22f <- matrix(ifelse(unlist(tmp22)<0.3,0,1),nrow=400,ncol=400)

hp <- hough(tmp22f) viewData(list(tmp2@grey,tmp22f,hp$hData),list("Original Map","Pattern Map","Hough Transformation"))

2008/07/02

R by

unique return without change the order
by return with the factor order
example:
> tmppar <- paste("X", c(1,1,1,3,3,8,8,6,6,5,5,2,2,7,7,4,4),sep="")
> tmppar
[1] "X1" "X1" "X1" "X3" "X3" "X8" "X8" "X6" "X6" "X5" "X5" "X2" "X2" "X7" "X7" "X4" "X4"
> unique(tmppar)
[1] "X1" "X3" "X8" "X6" "X5" "X2" "X7" "X4"
> tmp1 <- by(rnorm(length(tmppar)),tmppar,FUN=min)
> tmp1
INDICES: X1
[1] -0.7628715
-------------------------------------------------------------------------------------------------
INDICES: X2
[1] -0.5590477
-------------------------------------------------------------------------------------------------
INDICES: X3
[1] 1.214794
-------------------------------------------------------------------------------------------------
INDICES: X4
[1] 0.445656
-------------------------------------------------------------------------------------------------
INDICES: X5
[1] -0.8465884
-------------------------------------------------------------------------------------------------
INDICES: X6
[1] -1.096506
-------------------------------------------------------------------------------------------------
INDICES: X7
[1] -0.2734341
-------------------------------------------------------------------------------------------------
INDICES: X8
[1] 0.4012138

2008/05/08

Canonical Correlation Analysis

CCA(Canonical correlation Analysis) for the model AX = BY , where X and Y both have dimension more than one.
The R code for this,
library(CCA)
res <- cc(X, Y)
#compare coefficient under same PC's
ccaPlot <- function(res, choice = 1) {
par(mfrow=c(2,1))
barplot(res$xcoef[, choice], las =2, main = paste("Coefficient of PC", choice))
barplot(res$ycoef[, choice], las=2)

}

Sample graphic

2008/04/22

Using Parallel Coordinate Plot for variable selection

For detecting critical parameters, we can transform the problem into high-dimensional data problem. The parallel coordinate plot is good for high dimensions visualization. But it's lack of index for point out which variable is important. One can use this plot as first filtering method.



For construct above chart, we put the response variable in the first column, and sorting the data by the response variable, use different colors by grouping the data for easier detect.

2008/02/25

Data normality test

For testing the data is from normal or not, the Shapiro test is more powerful then other tests. But Shapiro has restrict in R, the sample size must less than 5000.
So facing data points great than 5000, we use Kolmogorov-Smirnov tests.
The code as following:


normalityTest <- function(x, group=NULL, alpha=0.05){

if(is.null(group)){
x <- na.omit(x)
if(all(x==x[1]) | length(x)<5 ){
return(FALSE)
} else if (length(x) > 5000){
return(ks.test(x, "pnorm", mean(x), sd(x))$p.value>alpha)
} else {
return(shapiro.test(x)$p.value>alpha)
}
} else {
ng <- as.character(unique(group))
for(ng.ind in 1:length(ng)){
tmp <- x[group==ng[ng.ind]]
tmp <- na.omit(tmp)
if(all(tmp==tmp[1]) | length(tmp)<5){
stop(return(FALSE))
} else {
if(length(tmp) > 5000){
p <- ks.test(x, "pnorm", mean(x), sd(x))$p.value
} else {
p <- shapiro.test(tmp)$p.value
}
if(p < alpha) stop(return(FALSE))
}
}
return(TRUE)
}
}

2008/02/21

Tukey HSD groups represent in R

For the post hoc comparison of the ANOVA, R provide TukeyHSD for pairwise comparison and graphics function. For more informative represent the result, it's a good one from the SAS output. The following code do the same thing as SAS except the order is from minimul to maximum

testResult <- function(x, ...) UseMethod("testResult")
testResult.default <- function(x){
if(! is(x,"aov")){
return("Not an ANOVA test")
}
tmp <- unlist(summary(x))
tmp1 <- tmp[which(names(tmp)=="Pr(>F)1", arr.ind=T)]
mm <- model.tables(x, "means")
tabs <- mm$tables[-1]
gname <- dimnames(tabs[[names(tabs)]])
mm1 <- unlist(mm)
nn <- matrix(mm1[-1], ncol = 2)
tmpsum <- data.frame(group = as.vector(gname), means = nn[,1], count = nn[,2])
o <- order(tmpsum$means)
tmpsum <- tmpsum[o,]
if(tmp1 < 0.05){
tmp <- TukeyHSD(x, order=T)
tmp1 <- tmp[[names(tmp)]]
tmp2 <- tmp1[,4]
ng <- nrow(tmpsum)
tmp.hsd <- matrix(1, ncol = ng, nrow = ng)
tmp.hsd[lower.tri(tmp.hsd, diag = FALSE)] <- tmp2
rownames(tmp.hsd) <- tmpsum$group
for(col.i in 1:(ng-1)){
tmp <- tmp.hsd[,col.i]
ind <- which(tmp>0.05, arr.ind = TRUE)
if(length(ind)>0){
ind <- ind[ind > col.i-1]
ind <- max(ind)
} else {
ind <- col.i
}
tmpsum$to[col.i] <- ind
}
tmpsum$to[ng] <- ng
tmpsum$fr <- 1:nrow(tmpsum)
nr <- unique(tmpsum$to)
tmpsum$g <- as.character(" ")
for(nr.ind in 1:length(nr)){
tmp <- subset(tmpsum, to == nr[nr.ind], select = c("to","fr"))
s.ind <- min(tmp$fr)
e.ind <- max(tmp$to)
if(nr.ind == 1){
tmpsum$g[s.ind:e.ind] <- as.character(LETTERS[nr.ind])
tmpsum$g[-(s.ind:e.ind)] <- as.character(" ")
} else {
tmpsum$g[s.ind:e.ind] <- paste(tmpsum$g[s.ind:e.ind], as.character(LETTERS[nr.ind]), sep="")
tmpsum$g[-(s.ind:e.ind)] <- paste(tmpsum$g[-(s.ind:e.ind)], as.character(" "), sep="")
}
}
out <- subset(tmpsum,, select=c("group","means","count","g"))
} else {
out <- data.frame(tmpsum, g = rep("A", nrow(tmpsum)))
}
return(out)
}

2008/01/30

Calculate variance from summary data


grandMean <- function(meanX, nX){
return(sum(meanX*nX)/sum(nX))
}

grandSTD <- function(meanX, stdX, nX){
gmean <- grandMean(meanX, nX)
ssr <- sum((nX-1)*stdX*stdX) + sum(nX*meanX*meanX)
ssm <- 2*sum(nX*meanX)*gmean
ssg <- sum(nX)*gmean*gmean
return(sqrt((ssr-ssm+ssg)/(sum(nX)-1)))
}

2008/01/14

Applying ICA in wafer map recognization

ICA(Independent Component Analysis)

Assumption: patterns are independent
Final map: regards as some patterns composite result
Training Step:
Step 1: Choose about 20 wafer maps from same pattern, applying ICA to each maps(suppose
have 2 or 3 sources)
Step 2: Averaging as the basis for the pattern

Classification Step:
For the unknown patterns map, applying ICA to get the basis
Using KNN to predicted

CC Copyright

創用 CC 授權條款
本著作由Chunhung Chou製作,以創用CC 姓名標示-相同方式分享 3.0 Unported 授權條款釋出。