houses <- read.csv("train.csv", header=T, stringsAsFactors = T)
finalTest <- read.csv("test.csv", stringsAsFactors = T)
  
# NA fill "No ____"
correctNAlist <- c("Alley", "Fence", "PoolQC", "FireplaceQu",
                   "MiscFeature", "GarageQual",
                   "GarageCond","GarageFinish", "GarageType",
                   "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
                   "BsmtFinType2", "MasVnrType")
for (item in correctNAlist) {
  houses <- houses |> 
    mutate(
      !!item := as.character(!!sym(item)),
      !!item := replace_na(!!sym(item), paste("No_", item)),
      !!item := as.factor(!!sym(item))
    )
}

# NA fill 0
fill_0 <- c("LotFrontage", "MasVnrArea")
for (item in fill_0) {
  houses <- houses |> 
    mutate(
      !!item := replace_na(!!sym(item), 0)
    )
}

houses <- houses |> 
  mutate(
    Electrical = replace_na(Electrical, "SBrkr"),
    GarageYrBlt = ifelse(GarageQual == "No_ GarageQual", 0, GarageYrBlt),
    TotalSF = TotalBsmtSF + X1stFlrSF + X2ndFlrSF,
    TotalSF = ifelse(TotalSF > 6000, mean(TotalSF), TotalSF),
    PercBsmtFin = (TotalBsmtSF - BsmtUnfSF) / BsmtUnfSF,
    PercBsmtFin = ifelse(is.na(PercBsmtFin) |
                           PercBsmtFin=="Inf", 0, PercBsmtFin),
    Has2nd = ifelse(X2ndFlrSF == 0, 1, 0),
    Has2nd = as.factor(Has2nd),
    HasBsmt = ifelse(TotalBsmtSF == 0, 1, 0),
    HasBsmt = as.factor(HasBsmt),
    BsmtExcellent = ifelse(BsmtQual == "Ex", 1, 0),
    GarageCar3 = ifelse(GarageCars == 3, 1, 0),
    #### BOOLEAN ####
      # 2nd Floor
    X2ndFlr = ifelse(X2ndFlrSF > 0, 1, 0),
    RichNeigh = case_when(
      Neighborhood %in% c("StoneBr","NridgHt","NoRidge") ~ 18.6342,
      Neighborhood %in% c("Blmngtn", "ClearCr", "CollgCr",
                          "Crawfor", "Gilbert", "NWAmes",
                          "SawyerW", "Somerst", "Timber", "Veenker") ~ 13.2532,
      T ~ 7.3327
    ),
    KitchenQ = case_when(
      KitchenQual == "Ex" ~ 3.28555,
      KitchenQual == "Gd" ~ -1.16439,
      KitchenQual == "TA" ~ -1.88592,
      KitchenQual == "Fa" ~ -2.22990,
    ),
    OverallQ = case_when(
      OverallQual == 1 ~ 0.4709,
      OverallQual == 2 ~ -5.5332,
      OverallQual == 3 ~ 18.5539,
      OverallQual == 4 ~ 28.4397,
      OverallQual == 5 ~ 34.9930,
      OverallQual == 6 ~ 43.1099,
      OverallQual == 7 ~ 53.9437,
      OverallQual == 8 ~ 65.5431,
      OverallQual == 9 ~ 82.3895,
      OverallQual == 10 ~ 93.5527
    ),
    customX = 42.872*(TotalSF) +
      9658.661*(KitchenQ) +
      1759.385*(OverallQ) +
      4113.104*(RichNeigh),
    megaSwitch = ifelse(MSZoning %in% c("RL", "FV", "RH"), 1, 0)
  )

set.seed(122)

num_rows <- 1000 #1460 total
keep <- sample(1:nrow(houses), num_rows)

train <- houses[keep, ] #Use this in the lm(..., data=mytrain)

test <- houses[-keep, ] #Use this in the predict(..., newdata=mytest)

final.lm <- lm(SalePrice~customX + customX:GarageCar3 + GarageCar3,data=train)

Graphing the Model

b <- coef(final.lm)

ggplot(train, aes(x=customX, y=SalePrice, color=as.factor(GarageCar3))) +
  geom_point() +
  scale_y_continuous(limits = c(0, 630000), labels=c("$0", "$200k", "$400k", "600k")) +
  scale_x_continuous(labels=c("0", "100k", "200k", "300k", "400k", "500k")) +
  scale_color_manual(values = c("skyblue", "firebrick"), labels = c("No", "Yes")) +
  stat_function(fun=function(x) b[1] + b[2]*x, color="skyblue3") + 
  stat_function(fun=function(x) (b[1] + b[3]) + (b[2] + b[4])*x, color="firebrick3") + 
  labs(title="Can you Predict a House's Sale Price",
       subtitle="Just by knowing things about the house?",
       x="Custom X Variable",
       y="Sale Price ($)",
       color="Does the house have \n    a 3 car garage?")

Interpreting the Model

I chose a two lines model that switches on whether or not the house has a 3 car garage, and I created a custom X variable which is made up of a linear combination of Total Square Feet, Kitchen Quality, Overall Quality, and what kind of neighborhood it was in (rich, middle, or poorer).

summary(final.lm) |> 
  pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -15975 3923 -4.072 5.03e-05
customX 0.8676 0.01851 46.87 3.57e-254
GarageCar3 -84158 14575 -5.774 1.034e-08
customX:GarageCar3 0.3429 0.04518 7.591 7.303e-14
Fitting linear model: SalePrice ~ customX + customX:GarageCar3 + GarageCar3
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1000 29537 0.8463 0.8458

3 Car Garage

For the 3 Car Garage, the model’s predicted the sales price to increase by $1.21 for each increase on the custom X axis, which correlates to the following:

  • For each square foot added to the house, the average price increases by $51.90.
  • Having the house in one of the following Neighborhoods, could increase the average price as much as $56269.18: “StoneBr”, “NridgHt”, or “NoRidge”
  • Upgrading the kitchen quality from:
    • Average to Excellent would, on average, increase the sale price by $60463.84
    • Average to Good would, on average, increase the sale price by $52027.85
  • Increasing overall quality from:
    • 5 to 6 would, on average, increase the sale price by $17286.85
    • 5 to 7 would, on average, increase the sale price by $40359.98
    • 5 to 8 would, on average, increase the sale price by $65063.63
Not 3 Car Garage

For other houses, the model’s predicted the sales price to increase by $0.87 for each increase on the custom X axis. This decreases the average-price-increases spoken about for the 3 Car Garages by \(\approx 72\text{%}\). Here are the adjusted sales improvements:

  • For each square foot added to the house, the average price increases by $37.20.
  • Having the house in one of the following Neighborhoods, could increase the average price as much as $40329.94: “StoneBr”, “NridgHt”, or “NoRidge”
  • Upgrading the kitchen quality from:
    • Average to Excellent would, on average, increase the sale price by $43336.38
    • Average to Good would, on average, increase the sale price by $37290.04
  • Increasing overall quality from:
    • 5 to 6 would, on average, increase the sale price by $12390.04
    • 5 to 7 would, on average, increase the sale price by $28927.3
    • 5 to 8 would, on average, increase the sale price by $46633.21

Adjusted R-Squared

y <- predict(final.lm, newdata=test)

ybar <- mean(test$SalePrice)

SSTO <- sum( (test$SalePrice - ybar)^2 )

# Compute SSE for each model using SalePrice - yhat
SSE <- sum( (test$SalePrice - y)^2 )

# Compute R-squared for each
rs <- 1 - SSE/SSTO


n <- length(test$SalePrice) #sample siz
p <- length(coef(final.lm))
rsa <- 1 - (n-1)/(n-p)*SSE/SSTO

my_output_table2 <- data.frame(Model = c("MyLM"), `Original R2` = c(summary(final.lm)$r.squared), `Orig. Adj. R-squared` = c(summary(final.lm)$adj.r.squared), `Validation R-squared` = c(rs), `Validation Adj. R^2` = c(rsa))

colnames(my_output_table2) <- c("Model", "Original $R^2$", "Original Adj. $R^2$", "Validation $R^2$", "Validation Adj. $R^2$")

knitr::kable(my_output_table2, escape=TRUE, digits=4)
Model Original \(R^2\) Original Adj. \(R^2\) Validation \(R^2\) Validation Adj. \(R^2\)
MyLM 0.8463 0.8458 0.804 0.8027

This model validates pretty well, only decreasing in both the R-squared and adjusted R-squared by approximately 0.04. This model was trained on 1,000 houses’ data, and tested on 400 houses that were withheld in training. This tells us that this model would do pretty good at predicting house’s prices, even if it hadn’t seen it before. This model’s predictions are within $60k of the true value (95% of the time).

Other Data Exploration (for my eyes)

## 
## Call:
## lm(formula = SalePrice ~ TotalSF + KitchenQ + OverallQ + RichNeigh + 
##     GrLivArea, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -134820  -16582    -816   16366  201466 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39945.576   5195.163  -7.689 3.56e-14 ***
## TotalSF         44.274      2.489  17.791  < 2e-16 ***
## KitchenQ      9614.834    981.777   9.793  < 2e-16 ***
## OverallQ      1484.607    118.248  12.555  < 2e-16 ***
## RichNeigh     4317.831    360.082  11.991  < 2e-16 ***
## GrLivArea        3.363      3.222   1.044    0.297    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30880 on 994 degrees of freedom
## Multiple R-squared:  0.8323, Adjusted R-squared:  0.8315 
## F-statistic: 986.9 on 5 and 994 DF,  p-value: < 2.2e-16
good_col <- c()
rsq <- c()
base_formula <- as.formula("SalePrice ~ customX")
base_model <- lm(base_formula, data = train)
r <- summary(base_model)$adj.r.squared + 0.01

for(col in colnames(train)) {
  if(col %in% c("SalePrice", "Id", "LotConfig", "Condition1", "RoofMatl",
                "Exterior2nd", "ExterCond", "Heating", "HeatingQC")) next
  form2 <- paste("SalePrice~customX:", col, "+", col)
  model <- lm(form2, data = train)
  r2 <- summary(model)$adj.r.squared

  if (r2 > r) {
    good_col <- c(good_col, col)
    rsq <- c(rsq, r2)
  }
}
data.frame(col = good_col, rsq = rsq) |> 
  arrange(desc(rsq))
##             col       rsq
## 1  Neighborhood 0.8634673
## 2 SaleCondition 0.8478057
## 3   KitchenQual 0.8475114
## 4  BsmtExposure 0.8446646
## 5      BsmtQual 0.8435096
## 6  BsmtFinType1 0.8434191
## 7      BldgType 0.8433218
## 8      SaleType 0.8431049
## 9   FireplaceQu 0.8430820
ggplot(houses, aes(x=SalePrice, y=Neighborhood)) +
  geom_boxplot(aes(color=Neighborhood))

ggplot(houses, aes(x=SalePrice, y=BsmtQual)) +
  geom_boxplot(aes(color=BsmtQual))

cbind(train, R=final.lm$res, fit=final.lm$fit) |> 
ggplot(aes(x=fit, y=R)) +
  geom_point(aes(color=as.factor(GarageCars)))+
  geom_smooth()

# houses |> 
#   filter(SalePrice > 500000) |> 
# pairs(panel=panel.smooth)