Objective

We study a simple linear regression of house price on living area using the Ames dataset:

\[ \text{Price} = \beta_0 + \beta_1 \, \text{Area} + \varepsilon, \] where Price is sale price in thousands of USD, and Area is above-ground living area in thousands of square feet.

Data and Variables

library(modeldata)
library(ggplot2)

# Load the full Ames housing dataset shipped with the modeldata package
data(ames, package = "modeldata")

# Fix the RNG seed so the random subsample (and therefore every estimate,
# prediction, and plot derived from it) is the same on every run.
# NOTE: printed output captured from an earlier unseeded run will not match
# these numbers exactly.
set.seed(2024)

# Subsample 100 rows for illustration (class-size friendly)
ids <- sample.int(nrow(ames), size = 100)
ames.m1 <- ames[ids, ]

# Create scaled variables for readability
ames.m1$Price <- ames.m1$Sale_Price / 1000      # thousand USD
ames.m1$Area  <- ames.m1$Gr_Liv_Area / 1000     # thousand sq. ft.

# Peek at the original and rescaled columns side by side
head(ames.m1[, c("Sale_Price", "Gr_Liv_Area", "Price", "Area")])

Model Fitting

# Ordinary least squares: regress Price (thousand USD) on Area (thousand sq. ft.)
model <- lm(formula = Price ~ Area, data = ames.m1)

# Coefficient table, residual quantiles, and goodness-of-fit statistics
summary(object = model)
## 
## Call:
## lm(formula = Price ~ Area, data = ames.m1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.750  -30.853   -1.033   27.466  189.763 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.172     17.927  -0.288    0.774    
## Area         127.596     11.107  11.488   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.51 on 98 degrees of freedom
## Multiple R-squared:  0.5739, Adjusted R-squared:  0.5695 
## F-statistic:   132 on 1 and 98 DF,  p-value: < 2.2e-16

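Interpretation cheatsheet

- Slope (Area, 127.596): each additional 1,000 sq. ft. of living area is associated with an estimated average increase of about 127.6 thousand USD in sale price.
- Intercept (-5.172): the predicted price at Area = 0. No house has zero living area, so this is an extrapolation; it is not significantly different from zero (p = 0.774).
- Residual standard error (59.51): a typical prediction error is about 59.5 thousand USD.
- R-squared (0.5739): about 57% of the variation in Price is explained by Area.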

Prediction Example

# A hypothetical house with 1,400 sq. ft. of living area; Area is measured in
# thousands of square feet, hence 1.4.
new_house <- data.frame(Area = 1.4)

# Point prediction plus lower/upper bounds of the prediction interval
# (predict.lm's default confidence level is used).
pred <- predict(
  object = model,
  newdata = new_house,
  interval = "prediction"
)
print(pred)
##        fit     lwr      upr
## 1 173.4626 54.7476 292.1775

Visualization

Scatter Plot

# Scatter plot of Price against Area (both in scaled units) for the subsample.
p1 <- ggplot(ames.m1, aes(x = Area, y = Price)) +
  geom_point(size = 2, color = "blue") +
  labs(
    title = "Ames Housing Data",
    x = "House Area (thousand sq. ft.)",
    y = "House Price (thousand USD)"
  ) +
  theme(
    # White panel with a black frame and no grid lines. Border thickness is set
    # with `linewidth`; `size` for line elements is deprecated since
    # ggplot2 3.4.0.
    panel.background = element_rect(
      fill = "white", color = "black", linewidth = 1
    ),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Centered, bold title
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text  = element_text(size = 12)
  )
p1
Figure: Area vs. Price (scaled units)

Regression Line + Residuals

# Add fitted values and residuals from the fitted model to the data frame so
# they can be used as plotting aesthetics.
ames.m1$fitted <- fitted(model)
ames.m1$resid  <- resid(model)

# Scatter plot with the least-squares line and one vertical segment per
# observation showing its residual.
p2 <- ggplot(ames.m1, aes(x = Area, y = Price)) +
  geom_point(size = 2, color = "blue") +
  # segments showing residuals (vertical distance to fitted line)
  geom_segment(aes(xend = Area, yend = fitted),
               color = "gray50", linewidth = 0.4) +
  # Explicit formula avoids the "`geom_smooth()` using formula" message on
  # every render; y ~ x is the same line ggplot2 would fit by default.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE,
              color = "red", linewidth = 0.8) +
  labs(
    title = "Fitted Linear Regression Model",
    x = "House Area (thousand sq. ft.)",
    y = "Price (thousand USD)"
  ) +
  theme(
    # Border thickness is set with `linewidth`; `size` for line elements is
    # deprecated since ggplot2 3.4.0.
    panel.background = element_rect(
      fill = "white", color = "black", linewidth = 1
    ),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text  = element_text(size = 12)
  )
p2
Figure: Fitted linear regression with residual segments

Assumption Checks (Optional)

# Draw the standard lm diagnostic plots in a 2 x 2 grid. par() returns the
# previous settings, which are saved so the layout can be restored exactly
# (rather than assuming it was 1 x 1 beforehand).
old_par <- par(mfrow = c(2, 2))
plot(model)

# Restore the graphics layout that was in effect before this chunk
par(old_par)

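Reading the diagnostics briefly:

- Residuals vs Fitted: a flat trend with no curvature supports the linearity assumption.
- Normal Q-Q: points close to the reference line suggest approximately normal residuals.
- Scale-Location: a roughly horizontal band suggests constant error variance.
- Residuals vs Leverage: points with a large Cook's distance may be unduly influential.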

Takeaways