name: layout-general layout: true class: left, middle <style> .remark-slide-number { position: inherit; } .remark-slide-number .progress-bar-container { position: absolute; bottom: 0; height: 4px; display: block; left: 0; right: 0; } .remark-slide-number .progress-bar { height: 100%; background-color: red; } /* custom.css */ .plot-callout { width: 300px; bottom: 5%; right: 5%; position: absolute; padding: 0px; z-index: 100; } .plot-callout img { width: 100%; border: 1px solid #23373B; } </style>
--- name: inter-slide class: left, middle, inverse {{content}} --- class: middle, left, inverse # Analyse des Données : PCA illustrated ### 2024-03-05 #### [Master I MIDS & MFA]() #### [Analyse Exploratoire de Données](http://stephane-v-boucheron.fr/courses/eda/) #### [Stéphane Boucheron](http://stephane-v-boucheron.fr) --- ### Warm up ```r pacman::p_load(tidyverse) pacman::p_load(glue) pacman::p_load(tidyr) pacman::p_load(imager) pacman::p_load(here) pacman::p_load(plotly) img <- load.image(here("img", "iris-fleur-histoire.jpeg")) str(img) ``` ``` ## 'cimg' num [1:560, 1:373, 1, 1:3] 0.0902 0.0941 0.098 0.098 0.098 ... ``` ```r dim(img) ``` ``` ## [1] 560 373 1 3 ``` --- ![](img/iris-fleur-histoire.jpeg) --- ### Make it a dataframe ```r img_df_long <- img %>% grayscale(method = "Luma", drop = TRUE) %>% as.data.frame() head(img_df_long) ``` ``` ## x y value ## 1 1 1 0.1448627 ## 2 2 1 0.1487843 ## 3 3 1 0.1527059 ## 4 4 1 0.1527059 ## 5 5 1 0.1527059 ## 6 6 1 0.1566275 ``` --- ### Make it black and white .fl.w-50.pa2.f6[ ```r p_bw <- img_df_long %>% ggplot() + aes(x = x, y = y, fill = value) %>% geom_raster() + scale_y_reverse() + scale_fill_gradient(low = "black", high = "white") + guides(fill = "none") + coord_fixed() + theme_minimal() ``` ] .fl.w-50.pa2.f6[ <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-6-1.png" width="2100" /> ] --- ### Make it wide (and forget about two channels) .fl.w-50.pa2.f6[ ```r img_df <- img_df_long %>% tidyr::pivot_wider(names_from = y, values_from = value) img_df %>% head() ``` ] .fl.w-50.pa2.f6[ ``` ## # A tibble: 6 × 374 ## x `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11` `12` ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 0.145 0.145 0.149 0.153 0.157 0.161 0.165 0.165 0.161 0.165 0.169 0.169 ## 2 2 0.149 0.153 0.153 0.153 0.157 0.157 0.161 0.161 0.165 0.165 0.165 0.169 ## 3 3 0.153 0.153 0.157 0.157 0.158 0.162 0.161 0.161 0.165 0.165 0.165 0.165 ## 4 4 0.153 0.153 0.157 0.157 0.162 0.162 0.165 0.165 0.165 0.165 0.165 0.165 ## 5 5 0.153 0.153 0.157 0.157 0.162 0.162 0.165 0.165 0.161 0.165 0.165 0.169 ## 6 6 0.157 0.157 0.157 0.157 0.158 0.162 0.161 0.161 0.161 0.165 0.169 0.169 ## # ℹ 361 more variables: `13` <dbl>, `14` <dbl>, `15` <dbl>, `16` <dbl>, ## # `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>, `21` <dbl>, `22` <dbl>, ## # `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>, `27` <dbl>, `28` <dbl>, ## # `29` <dbl>, `30` <dbl>, `31` <dbl>, `32` <dbl>, `33` <dbl>, `34` <dbl>, ## # `35` <dbl>, `36` <dbl>, `37` <dbl>, `38` <dbl>, `39` <dbl>, `40` <dbl>, ## # `41` <dbl>, `42` <dbl>, `43` <dbl>, `44` <dbl>, `45` <dbl>, `46` <dbl>, ## # `47` <dbl>, `48` <dbl>, `49` <dbl>, `50` <dbl>, `51` <dbl>, `52` <dbl>, … ``` ] --- ### Perform PCA .fl.w-50.pa2.f6[ ```r img_pca <- img_df %>% dplyr::select(-x) %>% prcomp(scale = TRUE, center = TRUE) img_pca %>% broom::tidy(matrix="d") %>% head() ``` ] .fl.w-50.pa2.f6[ ``` ## # A tibble: 6 × 4 ## PC std.dev percent cumulative ## <dbl> <dbl> <dbl> <dbl> ## 1 1 12.9 0.444 0.444 ## 2 2 9.28 0.231 0.675 ## 3 3 6.16 0.102 0.777 ## 4 4 4.73 0.0601 0.837 ## 5 5 3.59 0.0345 0.871 ## 6 6 3.18 0.0271 0.898 ``` ] --- ### Make it an image again .f6[ ```r reverse_pca <- function(n_comp = 20, pca_object = img_pca){ # [From Kieran Healy] recon <- pca_object$x[, 1:n_comp] %*% t(pca_object$rotation[, 1:n_comp]) if(all(pca_object$scale != FALSE)){ recon <- scale(recon, center = FALSE, scale = 1/pca_object$scale) } if(all(pca_object$center != FALSE)){ recon <- scale(recon, scale = FALSE, center = -1 * pca_object$center) } recon_df <- data.frame(cbind(1:nrow(recon), recon)) colnames(recon_df) <- c("x", 1:(ncol(recon_df)-1)) recon_df_long <- recon_df %>% tidyr::pivot_longer(cols = -x, names_to = "y", values_to = "value") %>% mutate(y = as.numeric(y)) %>% arrange(y) %>% as.data.frame() recon_df_long } ``` ] .plot-callout[ [From Kieran Healy](https://kieranhealy.org/blog/archives/2019/10/27/reconstructing-images-using-pca/) ] --- ### Building a sequence of approximations ```r n_pcs <- c(2, 5, 10, seq(20, 100, 20), seq(150, 300, 50)) names(n_pcs) <- n_pcs ## map reverse_pca() recovered_imgs <- map_dfr(n_pcs, reverse_pca, .id = "pcs") %>% mutate(pcs = as.integer(pcs)) ``` --- ### Quality of reconstruction with respect to rank <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-9-1.png" width="2100" /> --- ### Paving the way for reconstructions ```r q <- recovered_imgs %>% filter(pcs==2) %>% ggplot() + aes(x = x, y = y, fill = value, frame=pcs) q_out <- q + geom_raster() + scale_y_reverse() + scale_fill_gradient(low = "black", high = "white") + guides(fill = "none") + coord_fixed() + labs(title = glue("Recovering the content of an {nrow(img_df)}x{ncol(img_df)} pixel image\nfrom a PCA of its pixels")) + theme_minimal() ``` --- ### Reconstructing from 2 components <img src="cm-6-EDA-bis_files/figure-html/approx_reconstruction-1.png" width="90%" /> Relative reconstruction error : 32.49 `\(\%\)` --- ### Reconstructing from 5 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-11-1.png" width="90%" /> Relative reconstruction error : 12.87 `\(\%\)` --- ### Reconstructing from 10 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-12-1.png" width="90%" /> Relative reconstruction error : 6.41 `\(\%\)` --- ### Reconstructing from 20 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-13-1.png" width="90%" /> Relative reconstruction error : 3.43 `\(\%\)` --- ### Reconstructing from 40 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-14-1.png" width="90%" /> Relative reconstruction error : 1.61 `\(\%\)` --- ### Reconstructing from 60 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-15-1.png" width="90%" /> Relative reconstruction error : 0.9 `\(\%\)` --- ### Reconstructing from 80 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-16-1.png" width="90%" /> Relative reconstruction error : 0.55 `\(\%\)` --- ### Reconstructing from 100 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-17-1.png" width="90%" /> Relative reconstruction error : 0.34 `\(\%\)` --- ### Reconstructing from 150 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-18-1.png" width="90%" /> Relative reconstruction error : 0.11 `\(\%\)` --- ### Reconstructing from 200 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-19-1.png" width="90%" /> Relative reconstruction error : 0.03 `\(\%\)` --- ### Reconstructing from 250 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-20-1.png" width="90%" /> Relative reconstruction error : 0.01 `\(\%\)` --- ### Reconstructing from 300 components <img src="cm-6-EDA-bis_files/figure-html/unnamed-chunk-21-1.png" width="90%" /> Relative reconstruction error : 0 `\(\%\)` --- exclude: true ### Working with colors ```r list_img_pca <- img_df_long %>% group_by(cc) %>% group_map(.f = ~ {pivot_wider(.x, names_from = `y`, values_from = `value`)}, .keep = FALSE) %>% map(.f = ~ (select(.x, -x) %>% prcomp(center=TRUE, scale.=TRUE))) ``` ```r reverse_pca_color <- function(n_comp = 20, list_pca_objects = list_img_pca){ # [From Kieran Healy] # recon <- pca_object$x[, 1:n_comp] %*% t(pca_object$rotation[, 1:n_comp]) list_recons <- list_pca_objects %>% map(.f = ~ .x$x[, 1:n_comp] %*% t(.x$rotation[, 1:n_comp])) # rescale_recons <- function(pca_object, recon) { # # if(all(pca_object$scale != FALSE)){ # recon <- scale(recon, # center = FALSE, # scale = 1/pca_object$scale) # } # # } list_recons <- map2(.x =list_pca_objects, .y =list_recons, .f = ~ if(all(.x$scale != FALSE)){ scale(.y, center = FALSE, scale = 1/.x$scale) } else {.y} ) list_recons <- map2(.x =list_pca_objects, .y =list_recons, .f = ~ if(all(.x$center != FALSE)) { scale(.y, scale = FALSE, center = -1 * .x$center) } else { .y } ) # if(all(pca_object$center != FALSE)){ # recon <- scale(recon, # scale = FALSE, # center = -1 * pca_object$center) # } #TODO recon_df <- data.frame(cbind(1:nrow(recon), recon)) colnames(recon_df) <- c("x", 1:(ncol(recon_df)-1)) recon_df_long <- recon_df %>% tidyr::pivot_longer(cols = -x, names_to = "y", values_to = "value") %>% mutate(y = as.numeric(y)) %>% arrange(y) %>% as.data.frame() recon_df_long } ```