Tuesday, May 20, 2014

Logistic Regression in R, with and without an R library


Logistic Regression with Regularization, without using an R library


costFunctionReg.R
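This computes the regularized cost J(theta) = -(1/m) * [ sum(y * log(h)) + sum((1 - y) * log(1 - h)) ] + (lambda/(2m)) * sum(theta[j]^2, j >= 2), where h = sigmoid(X %*% theta). Note that the intercept term theta[1] is left out of the penalty.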
 costFunctionReg <- function(theta, X, y, lambda){  
  m <- length(y)  
  # Exclude theta[1] (the intercept) from the regularization penalty  
  trunctheta <- theta[2:length(theta)]  
  h <- sigmoid(X %*% theta)  
  # Regularized cost: cross-entropy term plus the L2 penalty  
  J <- -1/m * (sum(y * log(h)) + sum((1 - y) * log(1 - h))) + (lambda/(2 * m)) * sum(trunctheta^2)  
  return(J)  
 }  
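The driver script further down passes a gradFunction to optim(), but its source is not shown in the post. A minimal sketch of the standard regularized gradient (with the intercept again left unpenalized) might look like the following; the file name gradFunction.R and the exact body are assumptions made to match the optim() call:

 # gradFunction.R (assumed; not part of the original post)  
 gradFunction <- function(theta, X, y, lambda){  
  m <- length(y)  
  h <- sigmoid(X %*% theta)  
  # Unregularized gradient of the cross-entropy cost  
  grad <- (1/m) * (t(X) %*% (h - y))  
  # Add the penalty term for every parameter except the intercept  
  grad[-1] <- grad[-1] + (lambda/m) * theta[-1]  
  as.vector(grad)  
 }  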



Feature Mapping - mapFeature.R

 # Map two features onto all polynomial terms of x1 and x2 up to the given degree  
 mapFeature <- function(x1, x2, degree){  
  out <- matrix(1, length(x1))   # bias column of ones  
  for (i in 1:degree){  
   for (j in 0:i){  
    out <- cbind(out, (x1^(i-j)) * (x2^j))  
   }  
  }  
  return(out)  
 }  
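For degree 6 this produces 28 columns, including the leading column of ones. A quick sanity check at degree 2, where the expected terms are 1, x1, x2, x1^2, x1*x2, x2^2:

 mapFeature(2, 3, 2)  
 #      [,1] [,2] [,3] [,4] [,5] [,6]  
 # [1,]    1    2    3    4    6    9  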


Sigmoid Function - sigmoid.R

 # Logistic (sigmoid) function, applied element-wise  
 sigmoid <- function(z) {  
  1/(1 + exp(-z))  
 }  
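A couple of spot checks:

 sigmoid(0)            # 0.5  
 sigmoid(c(-10, 10))   # approximately 0 and 1  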



Predict Function - predict.R

 # Predictor function: threshold the predicted probability at 0.5  
 # (note this shadows base R's predict() generic within the script)  
 predict <- function(theta, X){  
  p <- sigmoid(X %*% theta) > 0.5  
  return(p)  
 }  
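A small illustration (the shadowing of the predict() generic may be why the glm section later calls predict.glm() explicitly):

 # With theta = (0, 1), the prediction is simply x2 > 0  
 theta <- c(0, 1)  
 X <- matrix(c(1, 1, -2, 2), nrow = 2)   # rows: (1, -2) and (1, 2)  
 predict(theta, X)                       # FALSE, TRUE  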


The entire script -

 require(Hmisc)  
 source('sigmoid.R')  
 source('mapFeature.R')  
 source('costFunctionReg.R')  
 source('gradFunction.R')  
 source('predict.R')  
 setwd("D:/tmp/mlclass-ex1-005/mlclass-ex2-005/R-Studio/matlab")  
 mydata <- read.table("D:/tmp/mlclass-ex1-005/mlclass-ex2-005/R-Studio/ex2data2.txt", header=FALSE, sep=",")  
 names(mydata) <- c("Microchip Test 1", "Microchip Test 2", "Accepted")  
 x <- mydata[,1:2]  
 y <- mydata[,3]  
 # Plot the data to understand the classification problem  
 plot(x, pch=c(24,3)[y+1], bg='yellow')  
 legend(.8, 1, c('y = 1','y = 0'), pch=c(3,24), pt.bg='yellow')  
 # Map the two raw features onto all polynomial terms up to degree 6  
 x <- mapFeature(x[,1], x[,2], 6)  
 # The following can be used as well:  
 # x <- mapFeature(mydata$"Microchip Test 1", mydata$"Microchip Test 2", 6)  
 initial_theta <- matrix(0, ncol(x))  
 # Contour plot of the decision boundary for each lambda  
 Xbase <- mydata[,1:2]  
 plot(Xbase, pch=c(24,3)[y+1], bg='yellow')  
 legend(.8, 1, c('y = 1','y = 0'), pch=c(3,24), pt.bg='yellow')  
 lambda.array <- c(0, 1, 20, 50, 100, 150)  
 color.array <- c('black','darkblue','yellow','red','green','gainsboro')  
 # Colour key for the boundaries drawn below  
 legend(.85, -0.20, paste('lambda =', lambda.array), lty=1, lwd=2, col=color.array)  
 d <- 0  
 for (templambda in lambda.array){  
  d <- d + 1  
  # Minimize the regularized cost with BFGS for this lambda  
  result <- optim(initial_theta, costFunctionReg, gradFunction, method="BFGS", hessian=FALSE, X=x, y=y, lambda=templambda, control=list(maxit=400))  
  theta <- result$par  
  p <- predict(theta, x)  
  message('Train Accuracy (lambda = ', templambda, '): ', mean(p == y) * 100)  
  # Evaluate theta over a grid and draw the zero contour, i.e. the decision boundary  
  u <- seq(from=-1, to=1.5, by=.05)  
  v <- u  
  z <- matrix(0, length(u), length(v))  
  for (i in 1:length(u)){  
   for (j in 1:length(v)){  
    z[i,j] <- mapFeature(u[i], v[j], 6) %*% theta  
   }  
  }  
  contour(u, v, z, levels=0, add=TRUE, lwd=2, col=color.array[d], drawlabels=FALSE)  
 }  


The output with various lambda values -

(Plots: the training data with the decision boundary for each lambda in lambda.array, drawn in the corresponding colour from color.array.)

Logistic Regression WITH the R glm library



 data <- read.table("D:/tmp/mlclass-ex1-005/mlclass-ex2-005/R-Studio/ex2data2.txt", header=FALSE, sep=",")  
 names(data) <- c("test1", "test2", "accept")  
 head(data)  
 # Fit a logistic regression model with glm  
 model <- glm(accept ~ test1 + test2, family = binomial("logit"), data = data)  
 summary(model)  
 plot(model, which = 1)  
 # Residuals against the fitted values on the linear-predictor scale  
 plot(predict.glm(model), residuals(model))  
 abline(h = 0, lty = 2, col = "green")  
 in_frame <- data.frame(test1 = 1.0709, test2 = 0.10015)  
 # ?predict.glm  
 output <- predict.glm(model, newdata = in_frame, type = "response")  
 # A predicted probability of 0.5 or more is classified as 1, otherwise 0  
 if (output >= 0.5){  
  message(" Success ", output)  
 } else {  
  message(" FAILURE ", output)  
 }  
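Note that plain glm() fits an unregularized (maximum-likelihood) logistic model, so nothing here corresponds to the lambda penalty used earlier. For an actual L2-penalized fit from a library, a sketch using the glmnet package could look like this (glmnet is an addition here, not part of the original post):

 library(glmnet)  
 X <- as.matrix(data[, c("test1", "test2")])  
 # alpha = 0 selects the ridge (L2) penalty; lambda plays the same role as before  
 fit <- glmnet(X, data$accept, family = "binomial", alpha = 0, lambda = 1)  
 coef(fit)   # penalized intercept and coefficients  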






