Code
require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
old_theme <- theme_set(theme_minimal())require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
old_theme <- theme_set(theme_minimal())
The French data are built and made available by INSEE (the French Government Statistics Institute).
This dataset has been growing for a while. It has been considered by social scientists for decades. Given names are meant to give insights into a variety of phenomena, including religious observance.
A glimpse at that body of work can be found in L’archipel français by Jérôme Fourquet (Le Seuil, 2019).
Read the File documentation
# Local cache location of the INSEE given-names archive inside the project.
path_data <- 'DATA'
fname <- 'nat2021_csv.zip'
fpath <- here(path_data, fname)

# Download the archive only when no local copy exists yet.
if (!file.exists(fpath)) {
  # download.file() does not create missing directories, so make sure the
  # target folder exists before writing into it.
  dir.create(dirname(fpath), recursive = TRUE, showWarnings = FALSE)
  url <- "https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip"
  # Request a binary transfer explicitly so the archive is written unchanged.
  download.file(url, fpath, mode = "wb")
}

# read_csv2() parses ';'-delimited files; the path points at the zip archive,
# which readr decompresses on the fly.
df_fr <- readr::read_csv2(fpath)
df_fr |>
  glimpse()
Rows: 686,538
Columns: 4
$ sexe <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ preusuel <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENO…
$ annais <chr> "1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907…
$ nombre <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 1…
US data may be gathered from
Baby Names USA from 1910 to 2021 (SSA)
See https://www.ssa.gov/oact/babynames/background.html
It can also be obtained by installing and loading the “babynames” package.
Full baby name data provided by the SSA. This includes all names with at least 5 uses.
# Load the 'babynames' package, installing it first when it is not available.
if (!require("babynames")) {
  install.packages("babynames")
  # stopifnot() treats every unnamed argument as a condition to check. Passing
  # the message as a second positional argument made the string itself a
  # failing condition, so the call errored even after a successful install.
  # The message has to be given as the *name* of the condition instead.
  stopifnot(
    "Couldn't install and load package 'babynames'" = require("babynames")
  )
}
?babynames
Rename columns according to the next lookup table:
# Lookup table: new column name (left) -> column name in the INSEE file (right).
lkp <- list(
  year = "annais",
  sex = "sexe",
  name = "preusuel",
  n = "nombre"
)

# Rename the columns, tag the rows with their country, turn `sex` into a factor
# labelled "F"/"M" (levels in that order), and convert `year` to integer after
# mapping the "XXXX" placeholder to NA.
df_fr <- df_fr |>
  rename(!!!lkp) |>
  mutate(
    country = 'fr',
    sex = as_factor(sex) |>
      fct_recode("M" = "1", "F" = "2") |>
      fct_relevel("F", "M"),
    year = as.integer(ifelse(year == "XXXX", NA, year))
  )
# NOTE(review): base sample() treats a data frame as a list of columns, so
# sample(df_fr, 5) draws 5 columns (here: a random reordering of the five
# columns) and keeps every row. If a random subset of rows was intended,
# slice_sample(n = 5) would do that instead.
glimpse(sample(df_fr, 5))
!!! (bang-bang-bang) is offered by rlang package. Here, we use it to perform list unpacking (with the same intent and purposes we use dictionary unpacking in Python)
Rows: 686,538
Columns: 5
$ name <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOM…
$ country <chr> "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "f…
$ year <int> 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 19…
$ n <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 16…
$ sex <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,…
Download ‘Naissances totales par sexe’ from URL https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls from INED.
# Local cache path and remote location of the INED births workbook.
births_fr_path <- here(path_data, 't35.fr.xls')
births_fr_url <- 'https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls'

# Download the workbook only when no local copy exists yet.
if (!file.exists(births_fr_path)) {
  # download.file() does not create missing directories.
  dir.create(dirname(births_fr_path), recursive = TRUE, showWarnings = FALSE)
  # An .xls workbook is a binary file. download.file() only switches to a
  # binary transfer on its own for a fixed list of extensions (.zip, .gz, ...)
  # that does not include .xls, so without mode = "wb" the file is corrupted
  # on Windows and the Excel reader fails.
  download.file(births_fr_url, births_fr_path, mode = "wb")
}

# skip = 3 ignores the first three rows of the sheet; the next row supplies
# the column names.
births_fr <- readxl::read_excel(births_fr_path, skip = 3)
# Drop the first data row.
# NOTE(review): presumably a sub-header row - confirm against the workbook.
births_fr <- births_fr[-1, ]
births_fr |>
  glimpse()
Rows: 130
Columns: 10
$ `Répartition par sexe et vie` <chr> "1901", "1902", "1903", "…
$ `Ensemble des nés vivants` <dbl> 917075, 904434, 884498, 8…
$ `Nés vivants - Garçons` <dbl> 468125, 462097, 451510, 4…
$ `Nés vivants - Filles` <dbl> 448950, 442337, 432988, 4…
$ `Ensemble des enfants sans vie` <dbl> 32410, 32000, 31076, 3067…
$ `Enfants sans vie - Garçons` <chr> "18522", "18172", "17875"…
$ `Enfants sans vie - Filles` <chr> "13888", "13828", "13201"…
$ `Garçons vivants pour 100 nés\nvivants` <dbl> 51.0, 51.1, 51.0, 51.0, 5…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 104.3, 104.5, 104.3, 104.…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "13…
If you have problems with the excel reader, feel free to download an equivalent csv file from url
# Name the first column `year`, convert it to integer, and keep only the rows
# that have no missing value in any column.
births_fr <- births_fr |>
  rename(year = 1) |>
  mutate(year = as.integer(year)) |>
  drop_na()

# Bar chart of live births per year.
ggplot(births_fr, aes(x = year, y = `Ensemble des nés vivants`)) +
  geom_col() +
  labs(title = "Births in France")

# Tag the US table with its country and store `sex` as a factor.
babynames <- babynames |>
  mutate(
    country = 'us',
    sex = as_factor(sex)
  )
glimpse(babynames)
Rows: 1,924,665
Columns: 6
$ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 18…
$ sex <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Id…
$ n <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 12…
$ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.…
$ country <chr> "us", "us", "us", "us", "us", "us", "us", "us", "us", "us", "u…
# NOTE(review): `births` is not defined in the visible code; presumably it is
# the data set shipped with the 'babynames' package - confirm.
births_us <- births

# Bar chart of the `births` column against `year`.
ggplot(births_us, aes(x = year, y = births)) +
  geom_col() +
  labs(title = "Births in USA")
In dataset df_fr compute the total number of reported male and female births per year. Compute and plot the sex ratio.
# Total count `n` for every (year, sex) pair. summarise() peels off the last
# grouping level, so the result is still grouped by `year`.
df_accounted_births_fr <- summarise(
  group_by(df_fr, year, sex),
  n = sum(n)
)
glimpse(df_accounted_births_fr)
Rows: 246
Columns: 3
Groups: year [123]
$ year <int> 1900, 1900, 1901, 1901, 1902, 1902, 1903, 1903, 1904, 1904, 1905,…
$ sex <fct> F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M,…
$ n <dbl> 237653, 177387, 257492, 195964, 261437, 204354, 261450, 207360, 2…
# One row per year with the F and M totals side by side, plus the number of
# boys per 100 girls. The ratio column has exactly the same name as the
# corresponding column of `births_fr`.
df_app_sex_ratio_fr <- mutate(
  pivot_wider(
    df_accounted_births_fr,
    id_cols = year,
    names_from = sex,
    values_from = n
  ),
  `Garçons vivants pour 100\nfilles vivantes` = 100 * M / F
)
glimpse(df_app_sex_ratio_fr)
Rows: 123
Columns: 4
Groups: year [123]
$ year <int> 1900, 1901, 1902, 1903, 19…
$ F <dbl> 237653, 257492, 261437, 26…
$ M <dbl> 177387, 195964, 204354, 20…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 74.64118, 76.10489, 78.165…
# Bar chart of the apparent sex ratio per year. The plot object is stored
# without titles; titles are only added to the copy printed below.
p_app_sex_ratio_fr <- ggplot(
  df_app_sex_ratio_fr,
  aes(x = year, y = `Garçons vivants pour 100\nfilles vivantes`)
) +
  geom_col() +
  theme_minimal()

p_app_sex_ratio_fr +
  labs(
    title = "France: Apparent sex ratio",
    subtitle = "Dataset: 'nat2021_csv' (INSEE)"
  )
Compare with sex ratio as given in dataset from INED
# Reuse the stored plot specification with `births_fr` as the data: `%+%`
# swaps the data set while keeping the aesthetics and layers, which refer to
# the columns `year` and `Garçons vivants pour 100\nfilles vivantes`.
p_sex_ratio_fr <- p_app_sex_ratio_fr %+% births_fr

p_sex_ratio_fr +
  labs(
    title = "France: sex ratio",
    subtitle = "Dataset INED"
  )

# Combine both plots into one figure with patchwork.
(p_app_sex_ratio_fr + p_sex_ratio_fr) +
  plot_annotation(
    title = "Evolution of sex ratio at birth in France",
    subtitle = "Left: INSEE data. Right: INED data"
  )