houses <- read.csv("train.csv", header=T, stringsAsFactors = T)
finalTest <- read.csv("test.csv", stringsAsFactors = T)
# NA fill "No ____"
correctNAlist <- c("Alley", "Fence", "PoolQC", "FireplaceQu",
"MiscFeature", "GarageQual",
"GarageCond","GarageFinish", "GarageType",
"BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
"BsmtFinType2", "MasVnrType")
for (item in correctNAlist) {
houses <- houses |>
mutate(
!!item := as.character(!!sym(item)),
!!item := replace_na(!!sym(item), paste("No_", item)),
!!item := as.factor(!!sym(item))
)
}
# NA fill 0
fill_0 <- c("LotFrontage", "MasVnrArea")
for (item in fill_0) {
houses <- houses |>
mutate(
!!item := replace_na(!!sym(item), 0)
)
}
houses <- houses |>
mutate(
Electrical = replace_na(Electrical, "SBrkr"),
GarageYrBlt = ifelse(GarageQual == "No_ GarageQual", 0, GarageYrBlt),
TotalSF = TotalBsmtSF + X1stFlrSF + X2ndFlrSF,
TotalSF = ifelse(TotalSF > 6000, mean(TotalSF), TotalSF),
PercBsmtFin = (TotalBsmtSF - BsmtUnfSF) / BsmtUnfSF,
PercBsmtFin = ifelse(is.na(PercBsmtFin) |
PercBsmtFin=="Inf", 0, PercBsmtFin),
Has2nd = ifelse(X2ndFlrSF == 0, 1, 0),
Has2nd = as.factor(Has2nd),
HasBsmt = ifelse(TotalBsmtSF == 0, 1, 0),
HasBsmt = as.factor(HasBsmt),
BsmtExcellent = ifelse(BsmtQual == "Ex", 1, 0),
GarageCar3 = ifelse(GarageCars == 3, 1, 0),
#### BOOLEAN ####
# 2nd Floor
X2ndFlr = ifelse(X2ndFlrSF > 0, 1, 0),
RichNeigh = case_when(
Neighborhood %in% c("StoneBr","NridgHt","NoRidge") ~ 18.6342,
Neighborhood %in% c("Blmngtn", "ClearCr", "CollgCr",
"Crawfor", "Gilbert", "NWAmes",
"SawyerW", "Somerst", "Timber", "Veenker") ~ 13.2532,
T ~ 7.3327
),
KitchenQ = case_when(
KitchenQual == "Ex" ~ 3.28555,
KitchenQual == "Gd" ~ -1.16439,
KitchenQual == "TA" ~ -1.88592,
KitchenQual == "Fa" ~ -2.22990,
),
OverallQ = case_when(
OverallQual == 1 ~ 0.4709,
OverallQual == 2 ~ -5.5332,
OverallQual == 3 ~ 18.5539,
OverallQual == 4 ~ 28.4397,
OverallQual == 5 ~ 34.9930,
OverallQual == 6 ~ 43.1099,
OverallQual == 7 ~ 53.9437,
OverallQual == 8 ~ 65.5431,
OverallQual == 9 ~ 82.3895,
OverallQual == 10 ~ 93.5527
),
customX = 42.872*(TotalSF) +
9658.661*(KitchenQ) +
1759.385*(OverallQ) +
4113.104*(RichNeigh),
megaSwitch = ifelse(MSZoning %in% c("RL", "FV", "RH"), 1, 0)
)
set.seed(122)
num_rows <- 1000 #1460 total
keep <- sample(1:nrow(houses), num_rows)
train <- houses[keep, ] #Use this in the lm(..., data=mytrain)
test <- houses[-keep, ] #Use this in the predict(..., newdata=mytest)
final.lm <- lm(SalePrice~customX + customX:GarageCar3 + GarageCar3,data=train)
b <- coef(final.lm)
ggplot(train, aes(x=customX, y=SalePrice, color=as.factor(GarageCar3))) +
geom_point() +
scale_y_continuous(limits = c(0, 630000), labels=c("$0", "$200k", "$400k", "600k")) +
scale_x_continuous(labels=c("0", "100k", "200k", "300k", "400k", "500k")) +
scale_color_manual(values = c("skyblue", "firebrick"), labels = c("No", "Yes")) +
stat_function(fun=function(x) b[1] + b[2]*x, color="skyblue3") +
stat_function(fun=function(x) (b[1] + b[3]) + (b[2] + b[4])*x, color="firebrick3") +
labs(title="Can you Predict a House's Sale Price",
subtitle="Just by knowing things about the house?",
x="Custom X Variable",
y="Sale Price ($)",
color="Does the house have \n a 3 car garage?")
I chose a two lines model that switches on whether or not the house has a 3 car garage, and I created a custom X variable which is made up of a linear combination of Total Square Feet, Kitchen Quality, Overall Quality, and what kind of neighborhood it was in (rich, middle, or poorer).
summary(final.lm) |>
pander()
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -15975 | 3923 | -4.072 | 5.03e-05 |
customX | 0.8676 | 0.01851 | 46.87 | 3.57e-254 |
GarageCar3 | -84158 | 14575 | -5.774 | 1.034e-08 |
customX:GarageCar3 | 0.3429 | 0.04518 | 7.591 | 7.303e-14 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
1000 | 29537 | 0.8463 | 0.8458 |
For the 3 Car Garage, the model’s predicted the sales price to increase by $1.21 for each increase on the custom X axis, which correlates to the following:
For other houses, the model’s predicted the sales price to increase by $0.87 for each increase on the custom X axis. This decreases the average-price-increases spoken about for the 3 Car Garages by \(\approx 72\text{%}\). Here are the adjusted sales improvements:
y <- predict(final.lm, newdata=test)
ybar <- mean(test$SalePrice)
SSTO <- sum( (test$SalePrice - ybar)^2 )
# Compute SSE for each model using SalePrice - yhat
SSE <- sum( (test$SalePrice - y)^2 )
# Compute R-squared for each
rs <- 1 - SSE/SSTO
n <- length(test$SalePrice) #sample siz
p <- length(coef(final.lm))
rsa <- 1 - (n-1)/(n-p)*SSE/SSTO
my_output_table2 <- data.frame(Model = c("MyLM"), `Original R2` = c(summary(final.lm)$r.squared), `Orig. Adj. R-squared` = c(summary(final.lm)$adj.r.squared), `Validation R-squared` = c(rs), `Validation Adj. R^2` = c(rsa))
colnames(my_output_table2) <- c("Model", "Original $R^2$", "Original Adj. $R^2$", "Validation $R^2$", "Validation Adj. $R^2$")
knitr::kable(my_output_table2, escape=TRUE, digits=4)
Model | Original \(R^2\) | Original Adj. \(R^2\) | Validation \(R^2\) | Validation Adj. \(R^2\) |
---|---|---|---|---|
MyLM | 0.8463 | 0.8458 | 0.804 | 0.8027 |
This model validates pretty well, only decreasing in both the R-squared and adjusted R-squared by approximately 0.04. This model was trained on 1,000 houses’ data, and tested on 400 houses that were withheld in training. This tells us that this model would do pretty good at predicting house’s prices, even if it hadn’t seen it before. This model’s predictions are within $60k of the true value (95% of the time).
##
## Call:
## lm(formula = SalePrice ~ TotalSF + KitchenQ + OverallQ + RichNeigh +
## GrLivArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -134820 -16582 -816 16366 201466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -39945.576 5195.163 -7.689 3.56e-14 ***
## TotalSF 44.274 2.489 17.791 < 2e-16 ***
## KitchenQ 9614.834 981.777 9.793 < 2e-16 ***
## OverallQ 1484.607 118.248 12.555 < 2e-16 ***
## RichNeigh 4317.831 360.082 11.991 < 2e-16 ***
## GrLivArea 3.363 3.222 1.044 0.297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30880 on 994 degrees of freedom
## Multiple R-squared: 0.8323, Adjusted R-squared: 0.8315
## F-statistic: 986.9 on 5 and 994 DF, p-value: < 2.2e-16
good_col <- c()
rsq <- c()
base_formula <- as.formula("SalePrice ~ customX")
base_model <- lm(base_formula, data = train)
r <- summary(base_model)$adj.r.squared + 0.01
for(col in colnames(train)) {
if(col %in% c("SalePrice", "Id", "LotConfig", "Condition1", "RoofMatl",
"Exterior2nd", "ExterCond", "Heating", "HeatingQC")) next
form2 <- paste("SalePrice~customX:", col, "+", col)
model <- lm(form2, data = train)
r2 <- summary(model)$adj.r.squared
if (r2 > r) {
good_col <- c(good_col, col)
rsq <- c(rsq, r2)
}
}
data.frame(col = good_col, rsq = rsq) |>
arrange(desc(rsq))
## col rsq
## 1 Neighborhood 0.8634673
## 2 SaleCondition 0.8478057
## 3 KitchenQual 0.8475114
## 4 BsmtExposure 0.8446646
## 5 BsmtQual 0.8435096
## 6 BsmtFinType1 0.8434191
## 7 BldgType 0.8433218
## 8 SaleType 0.8431049
## 9 FireplaceQu 0.8430820
ggplot(houses, aes(x=SalePrice, y=Neighborhood)) +
geom_boxplot(aes(color=Neighborhood))
ggplot(houses, aes(x=SalePrice, y=BsmtQual)) +
geom_boxplot(aes(color=BsmtQual))
cbind(train, R=final.lm$res, fit=final.lm$fit) |>
ggplot(aes(x=fit, y=R)) +
geom_point(aes(color=as.factor(GarageCars)))+
geom_smooth()
# houses |>
# filter(SalePrice > 500000) |>
# pairs(panel=panel.smooth)