In Class exercise 13

Analysis
R
sf
spdep
GWmodel
SpatialML
tmap
rsample
yardstick
tidyverse
knitr
kableExtra
spatialRF
Author

Brian Lim

Published

November 4, 2024

Modified

November 12, 2024

Getting Started

Import the packages

pacman::p_load(sf, spdep, GWmodel, SpatialML, 
               tmap, rsample, yardstick, tidyverse, 
               knitr, kableExtra, spatialRF)

Preparing Data

mdata <- read_rds("data/In-class_Ex13/rds/mdata.rds")

Data Sampling

Calibrating predictive models are computational intensive, especially random forest method is used. For quick prototyping, a 10% sample will be selected at random from the data by using the code chunk below.

set.seed(1234)
HDB_sample <- mdata %>%
  sample_n(1500)

Checking of overlapping point

overlapping_points <- HDB_sample %>%
  mutate(overlap = lengths(st_equals(., .)) >1)
summary(overlapping_points$overlap)
   Mode   FALSE    TRUE 
logical    1047     453 

Spatial jitter

HDB_sample <- HDB_sample %>%
  st_jitter(amount = 5)

Data Sampling for training/test data

set.seed(1234)
resale_split <- initial_split(HDB_sample,
                              prop = 6.67/10,)
train_data <- training(resale_split)
test_data <- testing(resale_split)
write_rds(train_data, "data/In-class_Ex13/rds/train_data.rds")
write_rds(test_data, "data/In-class_Ex13/rds/test_data.rds")
train_data <- read_rds("data/In-class_Ex13/rds/train_data.rds")
test_data <-  read_rds("data/In-class_Ex13/rds/test_data.rds")

Multicollinearity check

mdata_nogeo <- mdata %>%
  st_drop_geometry()
ggstatsplot::ggcorrmat(mdata_nogeo[,2:17], matrix.type = "full")

Building a non-spatial multiple linear regression

price_mlr <- lm(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                data=train_data)
summary(price_mlr)

Call:
lm(formula = resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + 
    PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + 
    PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + 
    WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, 
    data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-205193  -39120   -1930   36545  472355 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)              107601.073  10601.261  10.150  < 2e-16 ***
floor_area_sqm             2780.698     90.579  30.699  < 2e-16 ***
storey_order              14299.298    339.115  42.167  < 2e-16 ***
remaining_lease_mths        344.490      4.592  75.027  < 2e-16 ***
PROX_CBD                 -16930.196    201.254 -84.124  < 2e-16 ***
PROX_ELDERLYCARE         -14441.025    994.867 -14.516  < 2e-16 ***
PROX_HAWKER              -19265.648   1273.597 -15.127  < 2e-16 ***
PROX_MRT                 -32564.272   1744.232 -18.670  < 2e-16 ***
PROX_PARK                 -5712.625   1483.885  -3.850 0.000119 ***
PROX_MALL                -14717.388   2007.818  -7.330 2.47e-13 ***
PROX_SUPERMARKET         -26881.938   4189.624  -6.416 1.46e-10 ***
WITHIN_350M_KINDERGARTEN   8520.472    632.812  13.464  < 2e-16 ***
WITHIN_350M_CHILDCARE     -4510.650    354.015 -12.741  < 2e-16 ***
WITHIN_350M_BUS             813.493    222.574   3.655 0.000259 ***
WITHIN_1KM_PRISCH         -8010.834    491.512 -16.298  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 61650 on 10320 degrees of freedom
Multiple R-squared:  0.7373,    Adjusted R-squared:  0.737 
F-statistic:  2069 on 14 and 10320 DF,  p-value: < 2.2e-16

Predictive Modelling with gwr

Computing bw

gwr_bw_train_ad <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                  data=train_data,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Take a cup of tea and have a break, it will take a few minutes.
          -----A kind suggestion from GWmodel development group
Adaptive bandwidth: 6395 CV score: 3.60536e+13 
Adaptive bandwidth: 3960 CV score: 3.320316e+13 
Adaptive bandwidth: 2455 CV score: 2.928339e+13 
Adaptive bandwidth: 1524 CV score: 2.550957e+13 
Adaptive bandwidth: 950 CV score: 1.95632e+13 
Adaptive bandwidth: 593 CV score: 1.58347e+13 
Adaptive bandwidth: 375 CV score: 1.310042e+13 
Adaptive bandwidth: 237 CV score: 1.113152e+13 
Adaptive bandwidth: 155 CV score: 9.572037e+12 
Adaptive bandwidth: 101 CV score: 8.457003e+12 
Adaptive bandwidth: 71 CV score: 7.605058e+12 
Adaptive bandwidth: 49 CV score: 6.965752e+12 
Adaptive bandwidth: 38 CV score: 8.249935e+12 
Adaptive bandwidth: 58 CV score: 7.275234e+12 
Adaptive bandwidth: 45 CV score: 6.871439e+12 
Adaptive bandwidth: 41 CV score: 6.7928e+12 
Adaptive bandwidth: 40 CV score: 6.780447e+12 
Adaptive bandwidth: 38 CV score: 8.249935e+12 
Adaptive bandwidth: 40 CV score: 6.780447e+12 

Model Calibration

gwr_ad <- gwr.basic(formula = resale_price ~
                            floor_area_sqm + storey_order +
                            remaining_lease_mths + PROX_CBD + 
                            PROX_ELDERLYCARE + PROX_HAWKER +
                            PROX_MRT + PROX_PARK + PROX_MALL + 
                            PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                            WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                            WITHIN_1KM_PRISCH,
                          data=train_data,
                          bw=20, 
                          kernel = 'gaussian', 
                          adaptive=TRUE,
                          longlat = FALSE)

Computing test data bw

gwr_bw_test_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL + 
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                  data=test_data,
                  approach="CV",
                  kernel="gaussian",
                  adaptive=TRUE,
                  longlat=FALSE)
Take a cup of tea and have a break, it will take a few minutes.
          -----A kind suggestion from GWmodel development group
Adaptive bandwidth: 3447 CV score: 1.902155e+13 
Adaptive bandwidth: 2138 CV score: 1.752645e+13 
Adaptive bandwidth: 1328 CV score: 1.556299e+13 
Adaptive bandwidth: 828 CV score: 1.357498e+13 
Adaptive bandwidth: 518 CV score: 1.030751e+13 
Adaptive bandwidth: 327 CV score: 8.348364e+12 
Adaptive bandwidth: 208 CV score: 6.860544e+12 
Adaptive bandwidth: 135 CV score: 5.969504e+12 
Adaptive bandwidth: 89 CV score: 5.242221e+12 
Adaptive bandwidth: 62 CV score: 4.742767e+12 
Adaptive bandwidth: 43 CV score: 4.357839e+12 
Adaptive bandwidth: 34 CV score: 4.125848e+12 
Adaptive bandwidth: 25 CV score: 4.04299e+12 
Adaptive bandwidth: 23 CV score: 1.549626e+13 
Adaptive bandwidth: 30 CV score: 4.074906e+12 
Adaptive bandwidth: 25 CV score: 4.04299e+12 

Predicting with test data

gwr_pred <- gwr.predict(formula = resale_price ~
                          floor_area_sqm + storey_order +
                          remaining_lease_mths + PROX_CBD + 
                          PROX_ELDERLYCARE + PROX_HAWKER + 
                          PROX_MRT + PROX_PARK + PROX_MALL + 
                          PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                          WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + 
                          WITHIN_1KM_PRISCH, 
                        data=train_data, 
                        predictdata = test_data, 
                        bw=20, 
                        kernel = 'gaussian', 
                        adaptive=TRUE, 
                        longlat = FALSE)

Saving predicted values

gwr_pred_df <- as.data.frame(
  gwr_pred$SDF$prediction) %>%
  rename(gwr_pred = "gwr_pred$SDF$prediction")

Predictive Modelling with RF method

Data preparation

coords <- st_coordinates(HDB_sample)
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)

To drop geometry column of the sf data.frame by using st_drop_geometry() of sf package.

train_data_nogeom <- train_data %>% 
  st_drop_geometry()

Calibrating RF model

set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm + storey_order + 
               remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE + 
               PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + 
               PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
               WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + 
               WITHIN_1KM_PRISCH,
             data=train_data_nogeom)

Preparing the test data

test_data_nogeom <- cbind(
  test_data,coords_test) %>%
  st_drop_geometry()

Predicting with rf

rf_pred <- predict(rf, data = test_data_nogeom)

Saving the predicted values

rf_pred_df <- as.data.frame(rf_pred$predictions) %>%
  rename(rf_pred = "rf_pred$predictions")

Predictive Modelling with SpatialML

set.seed(1234)
grf_ad <- grf(formula = resale_price ~ floor_area_sqm + storey_order +
                       remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE +
                       PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL +
                       PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                       WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                       WITHIN_1KM_PRISCH,
                     dframe=train_data_nogeom, 
                     bw=20,
                     kernel="adaptive",
                     coords=coords_train)

Predicting with the test data

grf_pred <- predict.grf(grf_ad, 
                           test_data_nogeom, 
                           x.var.name="X",
                           y.var.name="Y", 
                           local.w=1,
                           global.w=0)

Saving the predicted values

Convert the output from grf_pred into a data.frame.

grf_pred_df <- as.data.frame(grf_pred)

Model Comparison

test_data_pred <- test_data %>%
  select(resale_price) %>%
  cbind(gwr_pred_df) %>%
  cbind(rf_pred_df) %>%
  cbind(grf_pred_df)

Transposing Data

test_longer <- test_data_pred %>%
  st_drop_geometry() %>%
  pivot_longer(cols = ends_with("pred"),
               names_to = "model",
               values_to = "predicted")

Renaming

model_labels <- c(
  gwr_pred = "gwr",
  rf_pred = "Random Forest",
  grf_pred = "gwRF"
)

test_longer <- test_longer %>%
  mutate(model = recode (
    model, !!!model_labels))

Computing rmse

rmse_longer <- test_longer %>%
  group_by(model) %>%
  rmse(truth = resale_price,
       estimate = predicted) %>%
  rename(rmse = .estimate) %>%
  select(model, rmse)
rmse_results <- rmse_longer

Model Comparison Plots

ggplot(rmse_results,
       aes(x = reorder(model, rmse),
           y = rmse,
           fill = "skyblue")) +
  geom_bar(stat = "identity",
           fill = "skyblue",
           color = "black",
           width = 0.7) +
  labs(title = "RMSE Comparison of Models",
       y = "RMSE",
       x = "Model") +
  theme_minimal()

Scatter Plots

test_longer <- test_longer %>%
  left_join(rmse_results,
            by = "model")

ggplot(data = test_longer,
       aes(x = predicted,
           y = resale_price)) +
  facet_wrap(~ model) +
  geom_point() +
  geom_text(data = test_longer,
            aes(x = Inf, y = Inf,
                label = paste("RMSE:", round(rmse,2))),
            hjust = 1.1, vjust = 1.1,
            color = "black", size=4)

Variable importance

var_imp <- data.frame(
  Variable = names(grf_ad$Global.Model$variable.importance),
  Importance = grf_ad$Global.Model$variable.importance
)
ggplot(var_imp, aes(x = reorder(Variable, Importance),
                    y = Importance)) +
  geom_bar(stat = "identity",
           fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Variable IMportance from Ranger Model",
    x = "Variable",
    y = "Importance"
  ) +
    theme_minimal()