Processing math: 100%
+ - 0:00:00
Notes for current slide
Notes for next slide

Analyse des Données : PCA illustrated

2024-03-05

Master I MIDS & MFA

Analyse Exploratoire de Données

Stéphane Boucheron

Warm up

pacman::p_load(tidyverse)
pacman::p_load(glue)
pacman::p_load(tidyr)
pacman::p_load(imager)
pacman::p_load(here)
pacman::p_load(plotly)
img <- load.image(here("img", "iris-fleur-histoire.jpeg"))
str(img)
## 'cimg' num [1:560, 1:373, 1, 1:3] 0.0902 0.0941 0.098 0.098 0.098 ...
dim(img)
## [1] 560 373 1 3

Make it a dataframe

img_df_long <- img %>%
grayscale(method = "Luma", drop = TRUE) %>%
as.data.frame()
head(img_df_long)
## x y value
## 1 1 1 0.1448627
## 2 2 1 0.1487843
## 3 3 1 0.1527059
## 4 4 1 0.1527059
## 5 5 1 0.1527059
## 6 6 1 0.1566275

Make it black and white

p_bw <- img_df_long %>%
ggplot() +
aes(x = x, y = y, fill = value) %>%
geom_raster() +
scale_y_reverse() +
scale_fill_gradient(low = "black", high = "white") +
guides(fill = "none") +
coord_fixed() +
theme_minimal()

Make it wide (and forget about two channels)

img_df <- img_df_long %>%
tidyr::pivot_wider(names_from = y,
values_from = value)
img_df %>%
head()
## # A tibble: 6 × 374
## x `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11` `12`
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.145 0.145 0.149 0.153 0.157 0.161 0.165 0.165 0.161 0.165 0.169 0.169
## 2 2 0.149 0.153 0.153 0.153 0.157 0.157 0.161 0.161 0.165 0.165 0.165 0.169
## 3 3 0.153 0.153 0.157 0.157 0.158 0.162 0.161 0.161 0.165 0.165 0.165 0.165
## 4 4 0.153 0.153 0.157 0.157 0.162 0.162 0.165 0.165 0.165 0.165 0.165 0.165
## 5 5 0.153 0.153 0.157 0.157 0.162 0.162 0.165 0.165 0.161 0.165 0.165 0.169
## 6 6 0.157 0.157 0.157 0.157 0.158 0.162 0.161 0.161 0.161 0.165 0.169 0.169
## # ℹ 361 more variables: `13` <dbl>, `14` <dbl>, `15` <dbl>, `16` <dbl>,
## # `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>, `21` <dbl>, `22` <dbl>,
## # `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>, `27` <dbl>, `28` <dbl>,
## # `29` <dbl>, `30` <dbl>, `31` <dbl>, `32` <dbl>, `33` <dbl>, `34` <dbl>,
## # `35` <dbl>, `36` <dbl>, `37` <dbl>, `38` <dbl>, `39` <dbl>, `40` <dbl>,
## # `41` <dbl>, `42` <dbl>, `43` <dbl>, `44` <dbl>, `45` <dbl>, `46` <dbl>,
## # `47` <dbl>, `48` <dbl>, `49` <dbl>, `50` <dbl>, `51` <dbl>, `52` <dbl>, …

Perform PCA

img_pca <- img_df %>%
dplyr::select(-x) %>%
prcomp(scale = TRUE, center = TRUE)
img_pca %>%
broom::tidy(matrix="d") %>%
head()
## # A tibble: 6 × 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 12.9 0.444 0.444
## 2 2 9.28 0.231 0.675
## 3 3 6.16 0.102 0.777
## 4 4 4.73 0.0601 0.837
## 5 5 3.59 0.0345 0.871
## 6 6 3.18 0.0271 0.898

Make it an image again

reverse_pca <- function(n_comp = 20, pca_object = img_pca){
# [From Kieran Healy]
recon <- pca_object$x[, 1:n_comp] %*% t(pca_object$rotation[, 1:n_comp])
if(all(pca_object$scale != FALSE)){
recon <- scale(recon,
center = FALSE,
scale = 1/pca_object$scale)
}
if(all(pca_object$center != FALSE)){
recon <- scale(recon,
scale = FALSE,
center = -1 * pca_object$center)
}
recon_df <- data.frame(cbind(1:nrow(recon), recon))
colnames(recon_df) <- c("x", 1:(ncol(recon_df)-1))
recon_df_long <- recon_df %>%
tidyr::pivot_longer(cols = -x,
names_to = "y",
values_to = "value") %>%
mutate(y = as.numeric(y)) %>%
arrange(y) %>%
as.data.frame()
recon_df_long
}

Building a sequence of approximations

n_pcs <- c(2, 5, 10, seq(20, 100, 20), seq(150, 300, 50))
names(n_pcs) <- n_pcs
## map reverse_pca()
recovered_imgs <- map_dfr(n_pcs,
reverse_pca,
.id = "pcs") %>%
mutate(pcs = as.integer(pcs))

Quality of reconstruction with respect to rank

Paving the way for reconstructions

q <- recovered_imgs %>%
filter(pcs==2) %>%
ggplot() +
aes(x = x, y = y, fill = value, frame=pcs)
q_out <- q +
geom_raster() +
scale_y_reverse() +
scale_fill_gradient(low = "black", high = "white") +
guides(fill = "none") +
coord_fixed() +
labs(title = glue("Recovering the content of an {nrow(img_df)}x{ncol(img_df)} pixel image\nfrom a PCA of its pixels")) +
theme_minimal()

Reconstructing from 2 components

Relative reconstruction error : 32.49 %

Reconstructing from 5 components

Relative reconstruction error : 12.87 %

Reconstructing from 10 components

Relative reconstruction error : 6.41 %

Reconstructing from 20 components

Relative reconstruction error : 3.43 %

Reconstructing from 40 components

Relative reconstruction error : 1.61 %

Reconstructing from 60 components

Relative reconstruction error : 0.9 %

Reconstructing from 80 components

Relative reconstruction error : 0.55 %

Reconstructing from 100 components

Relative reconstruction error : 0.34 %

Reconstructing from 150 components

Relative reconstruction error : 0.11 %

Reconstructing from 200 components

Relative reconstruction error : 0.03 %

Reconstructing from 250 components

Relative reconstruction error : 0.01 %

Reconstructing from 300 components

Relative reconstruction error : 0 %

Analyse des Données : PCA illustrated

2024-03-05

Master I MIDS & MFA

Analyse Exploratoire de Données

Stéphane Boucheron

Paused

Help

Keyboard shortcuts

, , Pg Up, k Go to previous slide
, , Pg Dn, Space, j Go to next slide
Home Go to first slide
End Go to last slide
Number + Return Go to specific slide
b / m / f Toggle blackout / mirrored / fullscreen mode
c Clone slideshow
p Toggle presenter mode
t Restart the presentation timer
?, h Toggle this help
oTile View: Overview of Slides
Esc Back to slideshow