https://www.edx.org/course/the-analytics-edge

setwd("E:\\The Analytics Edge\\Unit 3 Logistic Regression")
framingham = read.csv("framingham.csv")

#### Randomly split the data into training and testing sets

library(caTools)
set.seed(1000)
split = sample.split(framingham$TenYearCHD, SplitRatio = 0.65) 划分测试集，训练集 train = subset(framingham, split==TRUE) test = subset(framingham, split==FALSE) #### Logistic Regression Model R中使用Logistic Regression的方法很简单，只要使用如下的格式。 framinghamLog = glm(TenYearCHD ~ ., data = train, family=binomial) summary(framinghamLog) Call: glm(formula = TenYearCHD ~ ., family = binomial, data = train) Deviance Residuals: Min 1Q Median 3Q Max -1.8487 -0.6007 -0.4257 -0.2842 2.8369 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -7.886574 0.890729 -8.854 < 2e-16 *** male 0.528457 0.135443 3.902 9.55e-05 *** age 0.062055 0.008343 7.438 1.02e-13 *** education -0.058923 0.062430 -0.944 0.34525 currentSmoker 0.093240 0.194008 0.481 0.63080 cigsPerDay 0.015008 0.007826 1.918 0.05514 . BPMeds 0.311221 0.287408 1.083 0.27887 prevalentStroke 1.165794 0.571215 2.041 0.04126 * prevalentHyp 0.315818 0.171765 1.839 0.06596 . diabetes -0.421494 0.407990 -1.033 0.30156 totChol 0.003835 0.001377 2.786 0.00533 ** sysBP 0.011344 0.004566 2.485 0.01297 * diaBP -0.004740 0.008001 -0.592 0.55353 BMI 0.010723 0.016157 0.664 0.50689 heartRate -0.008099 0.005313 -1.524 0.12739 glucose 0.008935 0.002836 3.150 0.00163 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 2020.7 on 2384 degrees of freedom Residual deviance: 1792.3 on 2369 degrees of freedom (371 observations deleted due to missingness) AIC: 1824.3 Number of Fisher Scoring iterations: 5  #### Predictions on the test set 在测试集上预测结果。 predictTest = predict(framinghamLog, type="response", newdata=test) #### Confusion matrix with threshold of 0.5 设置threshold为$0.5$，比较结果。 table(test$TenYearCHD, predictTest > 0.5)
    FALSE TRUE
0  1069    6
1   187   11


#### Plot ROC curve

library(ROCR)
ROCRpred = prediction(predictTest, test$TenYearCHD) ROCRperf = performance(ROCRpred, "tpr", "fpr") plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,by=0.1), text.adj=c(-0.2,1.7)) #### Area Under the ROC Curve (AUC) 最后介绍一下AUC，AUC表示ROC曲线围成的面积，如下图所示： AUC表示随机给一个positive和negative的案例，我们预测正确的比例，可以如下计算。 as.numeric(performance(ROCRpred, "auc")@y.values) 0.742109466760632 将threshold设置为AUC。 table(test$TenYearCHD, predictTest > 0.742109466760632)
    FALSE
0  1075
1   198


#### Use multiple Imputation to deal with missing data

polling = read.csv("PollingData.csv")
summary(polling)
         State          Year        Rasmussen          SurveyUSA
Arizona    :  3   Min.   :2004   Min.   :-41.0000   Min.   :-33.0000
Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.0000   1st Qu.:-11.7500
California :  3   Median :2008   Median :  1.0000   Median : -2.0000
Colorado   :  3   Mean   :2008   Mean   :  0.0404   Mean   : -0.8243
Connecticut:  3   3rd Qu.:2012   3rd Qu.:  8.5000   3rd Qu.:  8.0000
Florida    :  3   Max.   :2012   Max.   : 39.0000   Max.   : 30.0000
(Other)    :127                  NA's   :46         NA's   :71
DiffCount           PropR          Republican
Min.   :-19.000   Min.   :0.0000   Min.   :0.0000
1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000
Median :  1.000   Median :0.6250   Median :1.0000
Mean   : -1.269   Mean   :0.5259   Mean   :0.5103
3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000
Max.   : 11.000   Max.   :1.0000   Max.   :1.0000


library(mice)

simple = polling[c("Rasmussen", "SurveyUSA", "PropR", "DiffCount")]
summary(simple)
   Rasmussen          SurveyUSA            PropR          DiffCount
Min.   :-41.0000   Min.   :-33.0000   Min.   :0.0000   Min.   :-19.000
1st Qu.: -8.0000   1st Qu.:-11.7500   1st Qu.:0.0000   1st Qu.: -6.000
Median :  1.0000   Median : -2.0000   Median :0.6250   Median :  1.000
Mean   :  0.0404   Mean   : -0.8243   Mean   :0.5259   Mean   : -1.269
3rd Qu.:  8.5000   3rd Qu.:  8.0000   3rd Qu.:1.0000   3rd Qu.:  4.000
Max.   : 39.0000   Max.   : 30.0000   Max.   :1.0000   Max.   : 11.000
NA's   :46         NA's   :71


imputed = complete(mice(simple))
summary(imputed)
   Rasmussen         SurveyUSA           PropR          DiffCount
Min.   :-41.000   Min.   :-33.000   Min.   :0.0000   Min.   :-19.000
1st Qu.: -8.000   1st Qu.:-11.000   1st Qu.:0.0000   1st Qu.: -6.000
Median :  3.000   Median :  1.000   Median :0.6250   Median :  1.000
Mean   :  1.703   Mean   :  2.014   Mean   :0.5259   Mean   : -1.269
3rd Qu.: 10.000   3rd Qu.: 18.000   3rd Qu.:1.0000   3rd Qu.:  4.000
Max.   : 39.000   Max.   : 30.000   Max.   :1.0000   Max.   : 11.000

polling$Rasmussen = imputed$Rasmussen
polling$SurveyUSA = imputed$SurveyUSA
summary(polling)
         State          Year        Rasmussen         SurveyUSA
Arizona    :  3   Min.   :2004   Min.   :-41.000   Min.   :-33.000
Arkansas   :  3   1st Qu.:2004   1st Qu.: -8.000   1st Qu.:-11.000
California :  3   Median :2008   Median :  3.000   Median :  1.000
Colorado   :  3   Mean   :2008   Mean   :  1.703   Mean   :  2.014
Connecticut:  3   3rd Qu.:2012   3rd Qu.: 10.000   3rd Qu.: 18.000
Florida    :  3   Max.   :2012   Max.   : 39.000   Max.   : 30.000
(Other)    :127
DiffCount           PropR          Republican
Min.   :-19.000   Min.   :0.0000   Min.   :0.0000
1st Qu.: -6.000   1st Qu.:0.0000   1st Qu.:0.0000
Median :  1.000   Median :0.6250   Median :1.0000
Mean   : -1.269   Mean   :0.5259   Mean   :0.5103
3rd Qu.:  4.000   3rd Qu.:1.0000   3rd Qu.:1.0000
Max.   : 11.000   Max.   :1.0000   Max.   :1.0000