http://www.stat.sc.edu/~hitchcock/516_sas_LOGISTIC_examp.txt
/* Data for the logistic regression example */
/* involving the cities' use of TIF */
/* that we studied in class */
/* Entering the data and naming the variables: */
/* Y is a binary response variable here. */
/* Y = 1 if a city used TIF. */
/* Y = 0 if the city did not use TIF. */
DATA citydata;
INPUT Y $ income;
CARDS;
0 9.2
0 12.9
0 9.2
1 9.6
0 9.3
1 10.1
0 9.4
1 10.3
0 9.5
1 10.9
0 9.5
1 10.9
0 9.5
1 11.1
0 9.6
1 11.1
0 9.7
1 11.1
0 9.7
1 11.5
0 9.8
1 11.8
0 9.8
1 11.9
0 9.9
1 12.1
0 10.5
1 12.2
0 10.5
1 12.5
0 10.9
1 12.6
0 11
1 12.6
0 11.2
1 12.6
0 11.2
1 12.9
0 11.5
1 12.9
0 11.7
1 12.9
0 11.8
1 12.9
0 12.1
1 13.1
0 12.3
1 13.2
0 12.5
1 13.5
;
run;
/* A simple plot of the data set */
goptions reset=all;
PROC GPLOT DATA=citydata;
PLOT Y*income;
run;
/* PROC LOGISTIC will fit a logistic regression model. */
/* The DESCENDING option is important! */
/* It defines mu as P(Y=1) as we did in class, rather than as P(Y=0). */
/* We create an output data set called NEW that contained the predicted probabilities. */
/* It also contains lower and upper bounds for a 95% CI for the true probability. */
PROC LOGISTIC DESCENDING DATA=citydata;
MODEL Y = INCOME / LACKFIT;
OUTPUT OUT=NEW P=PRED L=LOWER U=UPPER;
RUN;
/* Output: With the LACKFIT option, SAS provides a Hosmer-Lemeshow test for */
/* "H0: the logistic regression fits well". */
/* With a P-value of 0.4603, we cannot reject this null hypothesis. */
/* We have no reason to doubt the logistic model's fit. */
/* So it's fine to use the logistic regression model. */
/* The estimates beta_0-hat and beta_1-hat are -11.347 and 1.002. */
/* Estimated odds ratio = 2.723, and 95% CI for odds ratio is (1.526, 4.858). */
/* Output: SAS provides a likelihood-ratio test of H0: beta_1 = 0. Since the P-value */
/* is very small ( < .0001), we reject H0, conclude beta_1 is not zero, and conclude */
/* that income has a significant effect on the probability a city uses TIF. */
/* PLOTTING THE ESTIMATED LOGISTIC CURVE */
/* PROC PLOT gives a crude plot of the estimated logistic regression curve. */
/* The 95% CIs are also plotted. */
PROC PLOT DATA=NEW;
PLOT PRED*INCOME='P' LOWER*INCOME='L' UPPER*INCOME='U' / OVERLAY;
RUN;
/* Using PROC GPLOT gives a nicer-looking plot of the estimated logistic regression curve */
/* and the 95% CIs. */
/* This plot comes up in a separate window from the OUTPUT window. */
/* First the data must be sorted based on the X variable before using PROC GPLOT. */
/* (since we are going to connect the dots). */
/* We use a circle as the plotting symbol for the first curve (the predicted probabilities) */
/* and a star as the plotting symbol for the 2nd and 3rd curves (the CIs). */
PROC SORT DATA=NEW;
BY INCOME;
symbol1 i = join v=circle l=32 c = black;
symbol2 i = join v=star l=32 c = black;
symbol3 i = join v=star l=32 c = black;
PROC GPLOT DATA=NEW;
PLOT PRED*INCOME LOWER*INCOME UPPER*INCOME / OVERLAY;
RUN;