Published

March 7, 2024

Code

require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)

old_theme <- theme_set(theme_minimal())

L3 MIASHS
Université Paris Cité
Année 2023-2024
Course Homepage
Moodle

Naming babies

French data

The French data are built and made available by INSEE (French Governement Statistics Institute)

https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip

This dataset has been growing for a while. It has been considered by social scientists for decades. Given names are meant to give insights into a variety of phenomena, including religious observance.

A glimpse at that body of work can be found in L’archipel français by Jérome Fourquet, Le Seuil, 2019

Read the File documentation

Code

path_data <- 'DATA'
fname <- 'nat2021_csv.zip'
fpath <- here(path_data, fname)
if (!file.exists(fpath)){
  url <- "https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip"
  download.file(url, fpath)
}   

df_fr <- readr::read_csv2(fpath)

df_fr |> glimpse()

Rows: 686,538
Columns: 4
$ sexe     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ preusuel <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENO…
$ annais   <chr> "1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907…
$ nombre   <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 1…

US data

US data may be gathered from

Baby Names USA from 1910 to 2021 (SSA)

See https://www.ssa.gov/oact/babynames/background.html

It can also be obtained by installing and loading the “babynames” package.

Full baby name data provided by the SSA. This includes all names with at least 5 uses.

Code

if (!require("babynames")){
  install.packages("babynames")
  stopifnot(require("babynames"), "Couldn't install and load package 'babynames'")
}

Code

?babynames

Tidy the French data

Rename columns according to the next lookup table:

Code

lkp <- list(year="annais",
  sex="sexe",
  name="preusuel",
  n="nombre")

Code

df_fr <- df_fr |>
  rename(!!!lkp) |>
  mutate(country='fr') |>
  mutate(sex=as_factor(sex)) |>
  mutate(sex=fct_recode(sex, "M"="1", "F"="2")) |>
  mutate(sex=fct_relevel(sex, "F", "M")) |> 
  mutate(year=ifelse(year=="XXXX", NA, year)) |>
  mutate(year=as.integer(year)) 
  
df_fr  |>
  sample(5) |>
  glimpse()

1: !!! (bang-bang-bang) is offered by rlang package. Here, we use it to perform list unpacking (with the same intent and purposes we use dictionary unpacking in Python)

Rows: 686,538
Columns: 5
$ name    <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOM…
$ country <chr> "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "f…
$ year    <int> 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 19…
$ n       <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 16…
$ sex     <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,…

Download ‘Naissances totales par sexe’ from URL https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls from INED.

Code

births_fr_path <- here(path_data, 't35.fr.xls')
births_fr_url <- 'https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls'

if (!file.exists(births_fr_path)) {
  download.file(births_fr_url, births_fr_path)
}

Code

births_fr <-  readxl::read_excel(births_fr_path, skip = 3)

births_fr <- births_fr[-1, ] 


births_fr |> 
  glimpse()

Rows: 130
Columns: 10
$ `Répartition par sexe et vie`                <chr> "1901", "1902", "1903", "…
$ `Ensemble des nés vivants`                   <dbl> 917075, 904434, 884498, 8…
$ `Nés vivants - Garçons`                      <dbl> 468125, 462097, 451510, 4…
$ `Nés vivants - Filles`                       <dbl> 448950, 442337, 432988, 4…
$ `Ensemble des enfants sans vie`              <dbl> 32410, 32000, 31076, 3067…
$ `Enfants sans vie - Garçons`                 <chr> "18522", "18172", "17875"…
$ `Enfants sans vie - Filles`                  <chr> "13888", "13828", "13201"…
$ `Garçons vivants pour 100 nés\nvivants`      <dbl> 51.0, 51.1, 51.0, 51.0, 5…
$ `Garçons vivants pour 100\nfilles vivantes`  <dbl> 104.3, 104.5, 104.3, 104.…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "13…

If you have problems with the excel reader, feel free to download an equivalent csv file from url

Code

names(births_fr)[1] <- "year"

Code

births_fr <- births_fr |>
  mutate(year=as.integer(year)) |>
  drop_na()
  

births_fr |>
  ggplot() +
  aes(x=year, y=`Ensemble des nés vivants`) +
  geom_col() +
  labs(title="Births in France")

Tidy the American data

Code

babynames <- babynames |>
  mutate(country='us') |>
  mutate(sex=as_factor(sex))
  
babynames |>
  glimpse()

Rows: 1,924,665
Columns: 6
$ year    <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 18…
$ sex     <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ name    <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Id…
$ n       <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 12…
$ prop    <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.…
$ country <chr> "us", "us", "us", "us", "us", "us", "us", "us", "us", "us", "u…

Code

births_us <- births

births_us  |> 
  ggplot() +
  aes(x=year, y=births) +
  geom_col() +
  labs(title="Births in USA")

Sex ratios

Question

In dataset df_fr compute the total number of reported male and female births per year. Compute and plot the sex ratio.

Code

df_accounted_births_fr <- df_fr |>
  group_by(year, sex) |>
  summarise(n=sum(n)) 
  
df_accounted_births_fr |>
  glimpse()

Rows: 246
Columns: 3
Groups: year [123]
$ year <int> 1900, 1900, 1901, 1901, 1902, 1902, 1903, 1903, 1904, 1904, 1905,…
$ sex  <fct> F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M,…
$ n    <dbl> 237653, 177387, 257492, 195964, 261437, 204354, 261450, 207360, 2…

Code

df_app_sex_ratio_fr <- df_accounted_births_fr |>
  pivot_wider(id_cols=year, 
              names_from=sex, 
              values_from=`n`) |>
  mutate(`Garçons vivants pour 100\nfilles vivantes`=100*M/F)
              
df_app_sex_ratio_fr |>
  glimpse()

Rows: 123
Columns: 4
Groups: year [123]
$ year                                        <int> 1900, 1901, 1902, 1903, 19…
$ F                                           <dbl> 237653, 257492, 261437, 26…
$ M                                           <dbl> 177387, 195964, 204354, 20…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 74.64118, 76.10489, 78.165…

Code

p_app_sex_ratio_fr <- df_app_sex_ratio_fr |>
  ggplot() +
  aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes`) +
  geom_col() +
  theme_minimal()
  
  
p_app_sex_ratio_fr  +
  labs(
    title="France: Apparent sex ratio",
    subtitle="Dataset: 'nat2021_csv' (INSEE)"
  )

1: Should not be necessary

Question

Compare with sex ratio as given in dataset from INED

Code

p_sex_ratio_fr <- p_app_sex_ratio_fr %+% 
  births_fr 

p_sex_ratio_fr + labs(
    title="France: sex ratio",
    subtitle="Dataset INED")

Code

(p_app_sex_ratio_fr + p_sex_ratio_fr) +
  plot_annotation(
    title="Evolution of sex ratio  at birth in France",
    subtitle="Left: INSEE data. Right: INED data"
  )

Code

df_app_sex_ratio_fr |>
  inner_join(births_fr, by="year") |>
  glimpse()

Rows: 121
Columns: 13
Groups: year [121]
$ year                                          <int> 1901, 1902, 1903, 1904, …
$ F                                             <dbl> 257492, 261437, 261450, …
$ M                                             <dbl> 195964, 204354, 207360, …
$ `Garçons vivants pour 100\nfilles vivantes.x` <dbl> 76.10489, 78.16568, 79.3…
$ `Ensemble des nés vivants`                    <dbl> 917075, 904434, 884498, …
$ `Nés vivants - Garçons`                       <dbl> 468125, 462097, 451510, …
$ `Nés vivants - Filles`                        <dbl> 448950, 442337, 432988, …
$ `Ensemble des enfants sans vie`               <dbl> 32410, 32000, 31076, 306…
$ `Enfants sans vie - Garçons`                  <chr> "18522", "18172", "17875…
$ `Enfants sans vie - Filles`                   <chr> "13888", "13828", "13201…
$ `Garçons vivants pour 100 nés\nvivants`       <dbl> 51.0, 51.1, 51.0, 51.0, …
$ `Garçons vivants pour 100\nfilles vivantes.y` <dbl> 104.3, 104.5, 104.3, 104…
$ `Garçons sans vie pour 100\nfilles sans vie`  <chr> "133.40000000000001", "1…

Code

df_app_sex_ratio_fr |>
  inner_join(births_fr, by="year") |>
  ggplot() +
  aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes.x`/`Garçons vivants pour 100\nfilles vivantes.y`) +
  geom_point(size=.5) +
  scale_y_log10() +
  ylab('Ratio between apparent sex ratio and\n exact sex ratio') +
  labs(
    title="French data, confronting INSEE and INED data"
  )

Question

Consider the fluctuations of the sex ratio through the years.

Are they consistent with the hypothesis: the sex of newborns are independently. identically distributed with the probability of getting a girl equal to \(.48\)?

Question

Consider again the fluctuations of the sex ratio through the years.

Assume that for each year the sex of newborns are independently. identically distributed with the probability of getting a girl depending on the year.

Are the data consistent with the hypothesis: the probability of getting a girl remains constant thoughout the years?

Picturing concentration of babynames distributions

Every year, in each country, for both sex, the name counts define a discrete probability distribution over the set of names (the universe).

This distribution, just as an income or wealth distribution, is (usually) far from being uniform. We want to assess how uneven it is.

We use the tools developed in econometrics.

Without loss of generality, we assume that we handle a distribution over positive integers \(1, \ldots, n\) where \(n\) is the number of distinct names given during a year.

We assume that frequencies \(p_1, p_2, \ldots, p_n\) are given in ascending order, ties are broken arbitrarily.

The Lorenz function (Lorenz not Lorentz) maps \([0, 1] \to [0, 1]\).

\[L(x) = \sum_{i=1}^{\lfloor nx \rfloor} p_i .\]

Note that this is a piecewise constant function.

Question

Compute and plot the Lorenz function for a given sex, year and country

Code

make_lorenz_df <- function(df) {  
  df |>
  group_by(year, sex) |>
  arrange(n) |>
  mutate(rr=row_number()/n(), L=cumsum(n)/sum(n),  p=n/sum(n)) |>
  ungroup()
}

1: The three expressions defining rr, L and p act as window functions. The window is defined by partitioning by year, sex and ordering by n. In SQL parlance: WINDOW w AS (PARTITION BY year, sex ORDER BY n)

Code

df_lorenz_fr <- df_fr |> 
  filter(name != '_PRENOMS_RARES' &  !is.na(year)) |>
  make_lorenz_df()

df_lorenz_us <- babynames |> 
  make_lorenz_df()

Code

plot_lorenz <- function(df, ze_year=2020, ze_country='fr'){
  df |>
  filter(year==ze_year) |>
  ggplot() +
    aes(x=rr, y=L, linetype=sex) +
    geom_line()  +
    coord_fixed() +
    xlab("") +
    ylab("") +
    geom_abline(intercept=0, slope=1, linetype="dotted") +
    labs(title="Lorenz curve for babynames distribution",
         subtitle=glue("Year: {ze_year}"),
         caption=glue("Country: {ze_country}")
    )
}

Code

plot_lorenz(df_lorenz_fr, 2010, 'France') |
plot_lorenz(df_lorenz_us, 2010, 'USA')

Code

plot_lorenz(df_lorenz_fr, 1910, 'France') |
plot_lorenz(df_lorenz_us, 1910, 'USA')

Question

Design an animated plot that shows the evolution of the Lorenz curve of babynames distribution through the years for a given sex and country.

Code

p_inter <- filter(df_lorenz_fr, 
                  year %% 5 ==0, 
                  floor(rr*100)%% 5==0) |>
  ggplot() +
    aes(x=rr, y=L, linetype=sex, frame=year) +
    geom_line()  +
    coord_fixed() +
    xlab("") +
    ylab("") +
    geom_abline(intercept=0, slope=1, linetype="dotted") 

(p_inter + 
    labs(title="Lorenz curve for babynames distribution",
         caption=glue("Country: France")
    )) |> plotly::ggplotly()

Code

(
  p_inter %+% 
    filter(df_lorenz_us,
           year %% 5 ==0, 
           floor(rr*100)%% 5==0)  +
    labs(title="Lorenz curve for babynames distribution",
         caption=glue("Country: US"))
) |> plotly::ggplotly()

Inequality indices

The Lorenz curve summarizes how far a discrete probability distribution is from the uniform distribution. This is a very rich summary and it is difficult to communicate this message to a wide audience. People tend to favor numerical indices (they don’t really understand, but they get used to it): Gini, Atkinson, Theil, …

The Gini index is twice the surface of the area comprised between curves \(y=x\) and \(y=L(x)\).

\[G = 2 \times \int_0^1 (x -L(x)) \mathrm{d}x\]

The next formula allows us to compute it efficiently.

\[G={\frac {2\sum _{i=1}^{n}i p_{i}}{n\sum _{i=1}^{n}p_{i}}}-{\frac {n+1}{n}}.\]

Question

Compute and plot Gini index of names distribution over time for sex and countries

Code

p_gini <- df_lorenz_fr |>
  group_by(year, sex) |>
  summarize(gini=2 * sum(rr*p) - 1 - 1/n()) |>
  ggplot() +
  aes(x=year, y=gini, linetype=sex) +
  geom_line() +
  theme(legend.position="none") +
  ylab("Gini index") 

for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
  p_gini <- p_gini + 
    geom_vline(xintercept = y, linetype="dotted") 
}  

p_gini_fr <- p_gini +
  labs(subtitle="Country: France")

Code

p_gini_us <- (
  p_gini %+% 
    (df_lorenz_us |>
     group_by(year, sex) |>
     summarize(gini=2 * sum(rr*p) - 1 - 1/n(), .groups="drop")) +  
     labs(
      subtitle="Country: US"
  )
)

Code

(p_gini_fr| p_gini_us) +
  plot_annotation(
    title="Gini index of names distributions",
    subtitle="..."
)

Code

giniplot <- function (df) {
  df |>
  filter(name != '_PRENOMS_RARES' &  !is.na(year)) |>
  group_by(year, sex) |>
  mutate(gini=ineq::ineq(n)) |>
  ggplot() +
  aes(x=year, y=gini, linetype=sex) +
  geom_line() +
  theme(legend.position = "none")
}

p1 <- giniplot(filter(df_fr, year> 1947))
p2 <- giniplot(filter(babynames, year>1947))


( p1 | p2 ) +
  plot_annotation(
    title = "Evolution of Gini coeffcients of babynames distribution",
    subtitle="France (left), USA (right) \n plain: girls  dotted: boys"
  )

`PRENOMS RARES` in France

Question

For each sex, Plot the proportion of births given _PRENOMS_RARES as a function of year.

Code

df_fr |>
  filter(!is.na(year)) |>
  group_by(year, sex) |>
  mutate(total=sum(n)) |>
  filter(name=='_PRENOMS_RARES') |>
  select(-name) |>
  mutate(share= 100*n/total) |>
  ungroup() |> 
  ggplot() +
    aes(x=year, y=share, color=sex, linetype=sex) +
    geom_line() +
    labs(
      title="Share of rare names",
      subtitle="French data (INSEE)"
    ) +
  theme_minimal()

1: Here sum() works as a window function over partition by year, sex.
2: This should not be necessary. Inconsistency in quarto ?

Look for Mary in US Data

Marie, Jeanne and France in France

Question

Plot the proportion of female births given name ‘MARIE’ or ‘MARIE-…’ as a fucntion of year. Proceed in such a way that the reader can see the share of compounded names. We are expecting an area plot

Have a look at r-graph-gallery: stacked area and at ggplot documentation. Pay attention on the way you stack the area corresponding to names matching pattern ’MARIE-.*’ over or under the are corresponding to babies named ‘MARIE’

Code

theme_set(theme_minimal())

share_name  <- function(data, .name_stem='MARIE', .sex='F'){
  data %>%
  dplyr::filter(sex==.sex, !is.na(year)) %>%
  select(-sex) %>% 
  group_by(year) %>%
  summarize(strict=sum(ifelse(name==.name_stem, n, 0)),
            loose=sum(ifelse(stringr::str_starts(name, glue('{.name_stem}-')), n, 0)),
            total=sum(n)) %>% 
  transmute(year=year,
            strict=strict/total, 
            loose=loose/total) %>%
  pivot_longer(strict:loose,
               names_to=c("set"),
               values_to="share") %>% 
  mutate(set=factor(set, 
                    levels=c("loose", "strict"), 
                    ordered=TRUE)) 
}

Code

decline_and_fall <- function(df, .name_stem = "MARIE", .sex = "F"){

  df <- share_name(df, .name_stem, .sex) 
  maxshare <- max(pull(df, share), na.rm = T)

  p <- df |> 
    ggplot(aes(x=year)) +
    geom_area(aes(y=share, 
                  fill=set),
              position="stack") +
    ylab("share") +
    annotate('text', 
            label="1st WW", 
            x = 1916, 
            y=0.1*maxshare) + 
    annotate('text',  
             label="2nd WW", 
             x = 1942, 
             y=0.1*maxshare) + 
    annotate("text", 
             label= "1969", 
             x= 1968, 
             y= 0.5*maxshare) +
    theme_minimal()

  for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
    p <- p + geom_vline(xintercept = y, linetype="dotted") 
  }
  p
}

decline_and_fall(df_fr, .name_stem = "MARIE", .sex="F")

See Graphique 3, page 48, de L’archipel français de J. Fourquet. Le Seuil. Essais. Vol. 898.

Question

Answer the same question for JEANNE and FRANCE

Code

p_jeanne <- decline_and_fall(df_fr, "JEANNE")
# p_jeanne

Code

p_france <- decline_and_fall(df_fr, "FRANCE")

# p_france

Code

patchw <- p_jeanne / p_france 

patchw + plot_annotation(
  title="Decline of classic names",
  subtitle="Jeanne and France"
)

Patterns of popularity

Question

Plot the popularities of KEVIN, ENZO, STÉPHANE as a function of year.

Code

df_accounted_births_fr <- rename(df_accounted_births_fr, total=n)

Code

prenoms <- c("STÉPHANE", "KEVIN", "ENZO")

df_fr |>
  filter(year>1947) |>
  filter(name %in% prenoms, sex=="M") %>% 
  inner_join(df_accounted_births_fr, by=c("year", "sex")) %>% 
  ggplot() +
  aes(x=year, y=n/total, linetype=name) +
  geom_line() +
  scale_y_log10() +
  ggtitle(glue("Popularities"))

We can investigate surges of popularity for female English names in the way we did for male French names.

Code

hypenames <- c('Jessica', 'Jennifer', 'Dana', 'Monica', 'Laura')

(babynames %>% 
  filter(year > 1947) |> 
  filter(name %in% hypenames, sex=='F') %>% 
  inner_join(babynames::births, by=c("year")) %>% 
  ggplot() +
  aes(x=year, y=2*n/births, linetype=name, colour=name) +
  geom_line() +
  scale_y_log10() +
  ggtitle(glue("Popularities"))) |> 
    plotly::ggplotly()

Question

Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year. Use stacked area style plot.

Code

prenoms <- c("JEAN", "LUC", "MATHIEU", "MARC", "PAUL", "PIERRE", "JOSEPH", "FRANÇOIS")

df_fr %>% 
  filter(year>1947) |>
  filter(name %in% prenoms, sex=="M") %>% 
  mutate(name= as_factor(name)) %>% 
  mutate(name= fct_rev(name)) %>% 
  inner_join(df_accounted_births_fr, by=c("year", "sex")) %>% 
  ggplot() +
  aes(x=year, y=n/total, linetype=name, fill=name) +
  scale_fill_viridis_d() +
  geom_area(position = "stack") +
#  scale_y_log10() +
  ggtitle(glue("Popularities of Boy Names"))

Question

Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year. Use line plot.

Code

q <- (df_fr %>% 
  filter(year > 1947) |>
  filter(name %in% prenoms, sex=="M") %>% 
  inner_join(df_accounted_births_fr, by=c("year", "sex")) %>% 
  ggplot() +
  aes(x=year, y=n/total, linetype=name, colour=name) +
  geom_line() +
  scale_y_log10() +
  ggtitle(glue("Popularities"))) |> 
    plotly::ggplotly()

q

Question

Look for the translation of these names in US Data

Code

firstnames <- str_to_title(c("JOHN", "LUKE", "MATTHEW", "MARK", "PAUL", "PETER", "JOSEPH", "FRANCIS"))

(babynames %>% 
  filter(year > 1947) |> 
  filter(name %in% firstnames, sex=='M') %>% 
  inner_join(babynames::births, by=c("year")) %>% 
  ggplot() +
  aes(x=year, y=2*n/births, linetype=name, colour=name) +
  geom_line() +
  scale_y_log10() +
  ggtitle(glue("Popularities"))) |> 
    plotly::ggplotly()

The variations of popularity exhibit different patterns

Some names declined steadily after second world war.
Other names started from a very low popularity and enjoyed a rapid increase in popularity over one or two decades. Afterwards, these names rapidly lost the public favor and returned to obscurity.

Naming babies

French data

US data

Tidy the French data

Tidy the American data

Sex ratios

Picturing concentration of babynames distributions

Inequality indices

PRENOMS RARES in France

Marie, Jeanne and France in France

Patterns of popularity

`PRENOMS RARES` in France