Published

March 7, 2024

Code
# Attach the packages used in this document. library() is used rather than
# require() so that a missing package stops the script right here with a clear
# error, instead of returning FALSE and failing later at first use.
library(patchwork)
library(httr)
library(glue)
library(ineq)
library(here)
library(skimr)
library(magrittr)
library(tidyverse)

# Make theme_minimal() the default ggplot2 theme. theme_set() returns the
# previous theme, kept in old_theme so it can be restored later.
old_theme <- theme_set(theme_minimal())

Naming babies

French data

The French data are built and made available by INSEE (the French government statistics institute).

This dataset has been growing for a while. It has been considered by social scientists for decades. Given names are meant to give insights into a variety of phenomena, including religious observance.

A glimpse of that body of work can be found in L’archipel français by Jérôme Fourquet (Le Seuil, 2019).

Read the File documentation

Code
# Download the INSEE given-names archive once, cache it locally, then read it.
path_data <- 'DATA'
fname <- 'nat2021_csv.zip'
fpath <- here(path_data, fname)
if (!file.exists(fpath)) {
  # download.file() does not create missing directories, so make sure the
  # target directory exists before writing into it.
  dir.create(dirname(fpath), showWarnings = FALSE, recursive = TRUE)
  url <- "https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip"
  # The archive is binary: request binary mode explicitly so the bytes are
  # written unchanged on every platform.
  download.file(url, fpath, mode = "wb")
}

# read_csv2() expects ';' as the field separator; the zip path is passed
# directly to it.
df_fr <- readr::read_csv2(fpath)

df_fr |> glimpse()
Rows: 686,538
Columns: 4
$ sexe     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ preusuel <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENO…
$ annais   <chr> "1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907…
$ nombre   <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 1…

US data

US data may be gathered from

Baby Names USA from 1910 to 2021 (SSA)

See https://www.ssa.gov/oact/babynames/background.html

It can also be obtained by installing and loading the “babynames” package.

Full baby name data provided by the SSA. This includes all names with at least 5 uses.

Code
# Load 'babynames', installing it first when it is not available.
if (!require("babynames")) {
  install.packages("babynames")
  # stopifnot() treats every unnamed argument as a condition to check, so a
  # bare message string would itself be reported as "not TRUE" even after a
  # successful install. The message has to be given as the *name* of the
  # condition instead.
  stopifnot(
    "Couldn't install and load package 'babynames'" = require("babynames")
  )
}
Code
# Open the help page for `babynames`.
?babynames

Tidy the French data

Rename columns according to the next lookup table:

Code
# Renaming lookup table: each entry reads new_name = "original INSEE name".
lkp <- list(
  year = "annais",
  sex = "sexe",
  name = "preusuel",
  n = "nombre"
)
Code
# Rename the INSEE columns through the lookup table, tag every row with its
# country, and recode sex and year.
df_fr <- df_fr |>
  rename(!!!lkp) |>
  mutate(
    country = 'fr',
    # Codes 1/2 become the labels M/F, with F placed first among the levels.
    sex = sex |>
      as_factor() |>
      fct_recode("M" = "1", "F" = "2") |>
      fct_relevel("F", "M"),
    # The placeholder "XXXX" is turned into NA before the integer conversion.
    year = as.integer(ifelse(year == "XXXX", NA, year))
  )

# NOTE(review): sample() applied to a data frame draws 5 *columns*, not rows,
# so this prints every row with the columns in random order. If a 5-row
# preview was intended, slice_sample(n = 5) would do that (confirm).
df_fr |>
  sample(5) |>
  glimpse()
1
!!! (bang-bang-bang) is provided by the rlang package. Here, we use it to unpack a list into arguments (for the same purpose as dictionary unpacking in Python).
Rows: 686,538
Columns: 5
$ name    <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOM…
$ country <chr> "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "f…
$ year    <int> 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 19…
$ n       <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 16…
$ sex     <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,…

Download ‘Naissances totales par sexe’ from INED at https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls.

Code
# Local cache path and remote location of the INED workbook.
births_fr_path <- here(path_data, 't35.fr.xls')
births_fr_url <- 'https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls'

if (!file.exists(births_fr_path)) {
  # download.file() does not create missing directories.
  dir.create(dirname(births_fr_path), showWarnings = FALSE, recursive = TRUE)
  # An .xls workbook is a binary file, and on Windows download.file() does not
  # switch to binary mode by itself for this extension; request it explicitly
  # so the workbook is not corrupted.
  download.file(births_fr_url, births_fr_path, mode = "wb")
}
Code
# Read the workbook, skipping its first three lines, then discard the first
# row of the resulting table.
births_fr <- births_fr_path |>
  readxl::read_excel(skip = 3) |>
  slice(-1)

glimpse(births_fr)
Rows: 130
Columns: 10
$ `Répartition par sexe et vie`                <chr> "1901", "1902", "1903", "…
$ `Ensemble des nés vivants`                   <dbl> 917075, 904434, 884498, 8…
$ `Nés vivants - Garçons`                      <dbl> 468125, 462097, 451510, 4…
$ `Nés vivants - Filles`                       <dbl> 448950, 442337, 432988, 4…
$ `Ensemble des enfants sans vie`              <dbl> 32410, 32000, 31076, 3067…
$ `Enfants sans vie - Garçons`                 <chr> "18522", "18172", "17875"…
$ `Enfants sans vie - Filles`                  <chr> "13888", "13828", "13201"…
$ `Garçons vivants pour 100 nés\nvivants`      <dbl> 51.0, 51.1, 51.0, 51.0, 5…
$ `Garçons vivants pour 100\nfilles vivantes`  <dbl> 104.3, 104.5, 104.3, 104.…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "13…

If you have problems with the excel reader, feel free to download an equivalent csv file from url

Code
# The first column holds the year; give it a short, usable name.
colnames(births_fr)[1L] <- "year"
Code
# Convert year to integer, then drop incomplete rows.
# NOTE(review): drop_na() without arguments removes a row as soon as *any*
# column is NA, not only `year`; confirm that this is intended.
births_fr <- births_fr |>
  mutate(year = as.integer(year)) |>
  drop_na()

# Bar chart of the yearly number of live births.
ggplot(births_fr, aes(x = year, y = `Ensemble des nés vivants`)) +
  geom_col() +
  labs(title = "Births in France")

Tidy the American data

Code
# Work on a copy of babynames: tag the rows with their country and turn sex
# into a factor.
babynames <- babynames |>
  mutate(
    country = 'us',
    sex = as_factor(sex)
  )

glimpse(babynames)
Rows: 1,924,665
Columns: 6
$ year    <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 18…
$ sex     <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ name    <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Id…
$ n       <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 12…
$ prop    <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.…
$ country <chr> "us", "us", "us", "us", "us", "us", "us", "us", "us", "us", "u…
Code
# `births` is not defined in this document; presumably it is the table shipped
# with the babynames package (confirm).
births_us <- births

# Bar chart of the yearly number of births.
ggplot(births_us, aes(x = year, y = births)) +
  geom_col() +
  labs(title = "Births in USA")

Sex ratios

Question

In dataset df_fr compute the total number of reported male and female births per year. Compute and plot the sex ratio.

Code
# Total number of reported births per year and sex.
df_accounted_births_fr <- df_fr |>
  group_by(year, sex) |>
  # "drop_last" is what summarise() does by default here (the result stays
  # grouped by year). Stating it explicitly documents the grouping of the
  # result and avoids the informational message about regrouping.
  summarise(n = sum(n), .groups = "drop_last")

df_accounted_births_fr |>
  glimpse()
Rows: 246
Columns: 3
Groups: year [123]
$ year <int> 1900, 1900, 1901, 1901, 1902, 1902, 1903, 1903, 1904, 1904, 1905,…
$ sex  <fct> F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M,…
$ n    <dbl> 237653, 177387, 257492, 195964, 261437, 204354, 261450, 207360, 2…
Code
# One row per year with the F and M counts side by side, plus the number of
# boys per 100 girls. The ratio column carries the same name as the matching
# column of births_fr, so a plot mapped on it can be reused with either table.
df_app_sex_ratio_fr <- df_accounted_births_fr |>
  pivot_wider(id_cols = year, names_from = sex, values_from = n) |>
  mutate(`Garçons vivants pour 100\nfilles vivantes` = 100 * M / F)

glimpse(df_app_sex_ratio_fr)
Rows: 123
Columns: 4
Groups: year [123]
$ year                                        <int> 1900, 1901, 1902, 1903, 19…
$ F                                           <dbl> 237653, 257492, 261437, 26…
$ M                                           <dbl> 177387, 195964, 204354, 20…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 74.64118, 76.10489, 78.165…
Code
# Bar chart of the apparent sex ratio computed from the given-names data.
p_app_sex_ratio_fr <- ggplot(
  df_app_sex_ratio_fr,
  aes(x = year, y = `Garçons vivants pour 100\nfilles vivantes`)
) +
  geom_col() +
  theme_minimal()

# The titles are added for display only; the stored plot stays untitled.
p_app_sex_ratio_fr +
  labs(
    title = "France: Apparent sex ratio",
    subtitle = "Dataset: 'nat2021_csv' (INSEE)"
  )
1
Should not be necessary

Question

Compare with the sex ratio given in the INED dataset.

Code
# Reuse the specification of p_app_sex_ratio_fr, swapping in births_fr as the
# data; this works because both tables share the mapped column names.
p_sex_ratio_fr <- p_app_sex_ratio_fr %+% births_fr

p_sex_ratio_fr +
  labs(
    title = "France: sex ratio",
    subtitle = "Dataset INED"
  )

Code
# Put the two bar charts side by side with patchwork under a shared heading.
p_app_sex_ratio_fr +
  p_sex_ratio_fr +
  plot_annotation(
    title = "Evolution of sex ratio  at birth in France",
    subtitle = "Left: INSEE data. Right: INED data"
  )