Code
require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
old_theme <- theme_set(theme_minimal())require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
old_theme <- theme_set(theme_minimal())The French data are built and made available by INSEE (French Governement Statistics Institute)
This dataset has been growing for a while. It has been considered by social scientists for decades. Given names are meant to give insights into a variety of phenomena, including religious observance.
A glimpse at that body of work can be found in L’archipel français by Jérome Fourquet, Le Seuil, 2019
Read the File documentation
path_data <- 'DATA'
fname <- 'nat2021_csv.zip'
fpath <- here(path_data, fname)
if (!file.exists(fpath)){
url <- "https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip"
download.file(url, fpath)
}
df_fr <- readr::read_csv2(fpath)
df_fr |> glimpse()Rows: 686,538
Columns: 4
$ sexe <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ preusuel <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENO…
$ annais <chr> "1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907…
$ nombre <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 1…
US data may be gathered from
Baby Names USA from 1910 to 2021 (SSA)
See https://www.ssa.gov/oact/babynames/background.html
It can also be obtained by installing and loading the “babynames” package.
Full baby name data provided by the SSA. This includes all names with at least 5 uses.
if (!require("babynames")){
install.packages("babynames")
stopifnot(require("babynames"), "Couldn't install and load package 'babynames'")
}?babynamesRename columns according to the next lookup table:
lkp <- list(year="annais",
sex="sexe",
name="preusuel",
n="nombre")df_fr <- df_fr |>
rename(!!!lkp) |>
mutate(country='fr') |>
mutate(sex=as_factor(sex)) |>
mutate(sex=fct_recode(sex, "M"="1", "F"="2")) |>
mutate(sex=fct_relevel(sex, "F", "M")) |>
mutate(year=ifelse(year=="XXXX", NA, year)) |>
mutate(year=as.integer(year))
df_fr |>
sample(5) |>
glimpse()!!! (bang-bang-bang) is offered by rlang package. Here, we use it to perform list unpacking (with the same intent and purposes we use dictionary unpacking in Python)
Rows: 686,538
Columns: 5
$ name <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOM…
$ country <chr> "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "f…
$ year <int> 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 19…
$ n <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 16…
$ sex <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,…
Download ‘Naissances totales par sexe’ from URL https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls from INED.
births_fr_path <- here(path_data, 't35.fr.xls')
births_fr_url <- 'https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls'
if (!file.exists(births_fr_path)) {
download.file(births_fr_url, births_fr_path)
}births_fr <- readxl::read_excel(births_fr_path, skip = 3)
births_fr <- births_fr[-1, ]
births_fr |>
glimpse()Rows: 130
Columns: 10
$ `Répartition par sexe et vie` <chr> "1901", "1902", "1903", "…
$ `Ensemble des nés vivants` <dbl> 917075, 904434, 884498, 8…
$ `Nés vivants - Garçons` <dbl> 468125, 462097, 451510, 4…
$ `Nés vivants - Filles` <dbl> 448950, 442337, 432988, 4…
$ `Ensemble des enfants sans vie` <dbl> 32410, 32000, 31076, 3067…
$ `Enfants sans vie - Garçons` <chr> "18522", "18172", "17875"…
$ `Enfants sans vie - Filles` <chr> "13888", "13828", "13201"…
$ `Garçons vivants pour 100 nés\nvivants` <dbl> 51.0, 51.1, 51.0, 51.0, 5…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 104.3, 104.5, 104.3, 104.…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "13…
If you have problems with the excel reader, feel free to download an equivalent csv file from url
names(births_fr)[1] <- "year"births_fr <- births_fr |>
mutate(year=as.integer(year)) |>
drop_na()
births_fr |>
ggplot() +
aes(x=year, y=`Ensemble des nés vivants`) +
geom_col() +
labs(title="Births in France")babynames <- babynames |>
mutate(country='us') |>
mutate(sex=as_factor(sex))
babynames |>
glimpse()Rows: 1,924,665
Columns: 6
$ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 18…
$ sex <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Id…
$ n <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 12…
$ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.…
$ country <chr> "us", "us", "us", "us", "us", "us", "us", "us", "us", "us", "u…
births_us <- births
births_us |>
ggplot() +
aes(x=year, y=births) +
geom_col() +
labs(title="Births in USA")In dataset df_fr compute the total number of reported male and female births per year. Compute and plot the sex ratio.
df_accounted_births_fr <- df_fr |>
group_by(year, sex) |>
summarise(n=sum(n))
df_accounted_births_fr |>
glimpse()Rows: 246
Columns: 3
Groups: year [123]
$ year <int> 1900, 1900, 1901, 1901, 1902, 1902, 1903, 1903, 1904, 1904, 1905,…
$ sex <fct> F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M,…
$ n <dbl> 237653, 177387, 257492, 195964, 261437, 204354, 261450, 207360, 2…
df_app_sex_ratio_fr <- df_accounted_births_fr |>
pivot_wider(id_cols=year,
names_from=sex,
values_from=`n`) |>
mutate(`Garçons vivants pour 100\nfilles vivantes`=100*M/F)
df_app_sex_ratio_fr |>
glimpse()Rows: 123
Columns: 4
Groups: year [123]
$ year <int> 1900, 1901, 1902, 1903, 19…
$ F <dbl> 237653, 257492, 261437, 26…
$ M <dbl> 177387, 195964, 204354, 20…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 74.64118, 76.10489, 78.165…
p_app_sex_ratio_fr <- df_app_sex_ratio_fr |>
ggplot() +
aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes`) +
geom_col() +
theme_minimal()
p_app_sex_ratio_fr +
labs(
title="France: Apparent sex ratio",
subtitle="Dataset: 'nat2021_csv' (INSEE)"
) Compare with sex ratio as given in dataset from INED
p_sex_ratio_fr <- p_app_sex_ratio_fr %+%
births_fr
p_sex_ratio_fr + labs(
title="France: sex ratio",
subtitle="Dataset INED") (p_app_sex_ratio_fr + p_sex_ratio_fr) +
plot_annotation(
title="Evolution of sex ratio at birth in France",
subtitle="Left: INSEE data. Right: INED data"
)df_app_sex_ratio_fr |>
inner_join(births_fr, by="year") |>
glimpse()Rows: 121
Columns: 13
Groups: year [121]
$ year <int> 1901, 1902, 1903, 1904, …
$ F <dbl> 257492, 261437, 261450, …
$ M <dbl> 195964, 204354, 207360, …
$ `Garçons vivants pour 100\nfilles vivantes.x` <dbl> 76.10489, 78.16568, 79.3…
$ `Ensemble des nés vivants` <dbl> 917075, 904434, 884498, …
$ `Nés vivants - Garçons` <dbl> 468125, 462097, 451510, …
$ `Nés vivants - Filles` <dbl> 448950, 442337, 432988, …
$ `Ensemble des enfants sans vie` <dbl> 32410, 32000, 31076, 306…
$ `Enfants sans vie - Garçons` <chr> "18522", "18172", "17875…
$ `Enfants sans vie - Filles` <chr> "13888", "13828", "13201…
$ `Garçons vivants pour 100 nés\nvivants` <dbl> 51.0, 51.1, 51.0, 51.0, …
$ `Garçons vivants pour 100\nfilles vivantes.y` <dbl> 104.3, 104.5, 104.3, 104…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "1…
df_app_sex_ratio_fr |>
inner_join(births_fr, by="year") |>
ggplot() +
aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes.x`/`Garçons vivants pour 100\nfilles vivantes.y`) +
geom_point(size=.5) +
scale_y_log10() +
ylab('Ratio between apparent sex ratio and\n exact sex ratio') +
labs(
title="French data, confronting INSEE and INED data"
)Consider the fluctuations of the sex ratio through the years.
Are they consistent with the hypothesis: the sex of newborns are independently. identically distributed with the probability of getting a girl equal to \(.48\)?
Consider again the fluctuations of the sex ratio through the years.
Assume that for each year the sex of newborns are independently. identically distributed with the probability of getting a girl depending on the year.
Are the data consistent with the hypothesis: the probability of getting a girl remains constant thoughout the years?
Every year, in each country, for both sex, the name counts define a discrete probability distribution over the set of names (the universe).
This distribution, just as an income or wealth distribution, is (usually) far from being uniform. We want to assess how uneven it is.
We use the tools developed in econometrics.
Without loss of generality, we assume that we handle a distribution over positive integers \(1, \ldots, n\) where \(n\) is the number of distinct names given during a year.
We assume that frequencies \(p_1, p_2, \ldots, p_n\) are given in ascending order, ties are broken arbitrarily.
The Lorenz function (Lorenz not Lorentz) maps \([0, 1] \to [0, 1]\).
\[L(x) = \sum_{i=1}^{\lfloor nx \rfloor} p_i .\]
Note that this is a piecewise constant function.
Compute and plot the Lorenz function for a given sex, year and country
make_lorenz_df <- function(df) {
df |>
group_by(year, sex) |>
arrange(n) |>
mutate(rr=row_number()/n(), L=cumsum(n)/sum(n), p=n/sum(n)) |>
ungroup()
}rr, L and p act as window functions. The window is defined by partitioning by year, sex and ordering by n. In SQL parlance: WINDOW w AS (PARTITION BY year, sex ORDER BY n)
df_lorenz_fr <- df_fr |>
filter(name != '_PRENOMS_RARES' & !is.na(year)) |>
make_lorenz_df()
df_lorenz_us <- babynames |>
make_lorenz_df()plot_lorenz <- function(df, ze_year=2020, ze_country='fr'){
df |>
filter(year==ze_year) |>
ggplot() +
aes(x=rr, y=L, linetype=sex) +
geom_line() +
coord_fixed() +
xlab("") +
ylab("") +
geom_abline(intercept=0, slope=1, linetype="dotted") +
labs(title="Lorenz curve for babynames distribution",
subtitle=glue("Year: {ze_year}"),
caption=glue("Country: {ze_country}")
)
} plot_lorenz(df_lorenz_fr, 2010, 'France') |
plot_lorenz(df_lorenz_us, 2010, 'USA') plot_lorenz(df_lorenz_fr, 1910, 'France') |
plot_lorenz(df_lorenz_us, 1910, 'USA') Design an animated plot that shows the evolution of the Lorenz curve of babynames distribution through the years for a given sex and country.
p_inter <- filter(df_lorenz_fr,
year %% 5 ==0,
floor(rr*100)%% 5==0) |>
ggplot() +
aes(x=rr, y=L, linetype=sex, frame=year) +
geom_line() +
coord_fixed() +
xlab("") +
ylab("") +
geom_abline(intercept=0, slope=1, linetype="dotted")
(p_inter +
labs(title="Lorenz curve for babynames distribution",
caption=glue("Country: France")
)) |> plotly::ggplotly()(
p_inter %+%
filter(df_lorenz_us,
year %% 5 ==0,
floor(rr*100)%% 5==0) +
labs(title="Lorenz curve for babynames distribution",
caption=glue("Country: US"))
) |> plotly::ggplotly()The Lorenz curve summarizes how far a discrete probability distribution is from the uniform distribution. This is a very rich summary and it is difficult to communicate this message to a wide audience. People tend to favor numerical indices (they don’t really understand, but they get used to it): Gini, Atkinson, Theil, …
The Gini index is twice the surface of the area comprised between curves \(y=x\) and \(y=L(x)\).
\[G = 2 \times \int_0^1 (x -L(x)) \mathrm{d}x\]
The next formula allows us to compute it efficiently.
\[G={\frac {2\sum _{i=1}^{n}i p_{i}}{n\sum _{i=1}^{n}p_{i}}}-{\frac {n+1}{n}}.\]
Compute and plot Gini index of names distribution over time for sex and countries
p_gini <- df_lorenz_fr |>
group_by(year, sex) |>
summarize(gini=2 * sum(rr*p) - 1 - 1/n()) |>
ggplot() +
aes(x=year, y=gini, linetype=sex) +
geom_line() +
theme(legend.position="none") +
ylab("Gini index")
for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
p_gini <- p_gini +
geom_vline(xintercept = y, linetype="dotted")
}
p_gini_fr <- p_gini +
labs(subtitle="Country: France")p_gini_us <- (
p_gini %+%
(df_lorenz_us |>
group_by(year, sex) |>
summarize(gini=2 * sum(rr*p) - 1 - 1/n(), .groups="drop")) +
labs(
subtitle="Country: US"
)
)(p_gini_fr| p_gini_us) +
plot_annotation(
title="Gini index of names distributions",
subtitle="..."
)giniplot <- function (df) {
df |>
filter(name != '_PRENOMS_RARES' & !is.na(year)) |>
group_by(year, sex) |>
mutate(gini=ineq::ineq(n)) |>
ggplot() +
aes(x=year, y=gini, linetype=sex) +
geom_line() +
theme(legend.position = "none")
}
p1 <- giniplot(filter(df_fr, year> 1947))
p2 <- giniplot(filter(babynames, year>1947))
( p1 | p2 ) +
plot_annotation(
title = "Evolution of Gini coeffcients of babynames distribution",
subtitle="France (left), USA (right) \n plain: girls dotted: boys"
)PRENOMS RARES in FranceFor each sex, Plot the proportion of births given _PRENOMS_RARES as a function of year.
df_fr |>
filter(!is.na(year)) |>
group_by(year, sex) |>
mutate(total=sum(n)) |>
filter(name=='_PRENOMS_RARES') |>
select(-name) |>
mutate(share= 100*n/total) |>
ungroup() |>
ggplot() +
aes(x=year, y=share, color=sex, linetype=sex) +
geom_line() +
labs(
title="Share of rare names",
subtitle="French data (INSEE)"
) +
theme_minimal()sum() works as a window function over partition by year, sex.
Mary in US Data
Plot the proportion of female births given name ‘MARIE’ or ‘MARIE-…’ as a fucntion of year. Proceed in such a way that the reader can see the share of compounded names. We are expecting an area plot
Have a look at r-graph-gallery: stacked area and at ggplot documentation. Pay attention on the way you stack the area corresponding to names matching pattern ’MARIE-.*’ over or under the are corresponding to babies named ‘MARIE’
theme_set(theme_minimal())
share_name <- function(data, .name_stem='MARIE', .sex='F'){
data %>%
dplyr::filter(sex==.sex, !is.na(year)) %>%
select(-sex) %>%
group_by(year) %>%
summarize(strict=sum(ifelse(name==.name_stem, n, 0)),
loose=sum(ifelse(stringr::str_starts(name, glue('{.name_stem}-')), n, 0)),
total=sum(n)) %>%
transmute(year=year,
strict=strict/total,
loose=loose/total) %>%
pivot_longer(strict:loose,
names_to=c("set"),
values_to="share") %>%
mutate(set=factor(set,
levels=c("loose", "strict"),
ordered=TRUE))
} decline_and_fall <- function(df, .name_stem = "MARIE", .sex = "F"){
df <- share_name(df, .name_stem, .sex)
maxshare <- max(pull(df, share), na.rm = T)
p <- df |>
ggplot(aes(x=year)) +
geom_area(aes(y=share,
fill=set),
position="stack") +
ylab("share") +
annotate('text',
label="1st WW",
x = 1916,
y=0.1*maxshare) +
annotate('text',
label="2nd WW",
x = 1942,
y=0.1*maxshare) +
annotate("text",
label= "1969",
x= 1968,
y= 0.5*maxshare) +
theme_minimal()
for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
p <- p + geom_vline(xintercept = y, linetype="dotted")
}
p
}
decline_and_fall(df_fr, .name_stem = "MARIE", .sex="F")See Graphique 3, page 48, de L’archipel français de J. Fourquet. Le Seuil. Essais. Vol. 898.
Answer the same question for JEANNE and FRANCE
p_jeanne <- decline_and_fall(df_fr, "JEANNE")
# p_jeannep_france <- decline_and_fall(df_fr, "FRANCE")
# p_france patchw <- p_jeanne / p_france
patchw + plot_annotation(
title="Decline of classic names",
subtitle="Jeanne and France"
)Plot the popularities of KEVIN, ENZO, STÉPHANE as a function of year.
df_accounted_births_fr <- rename(df_accounted_births_fr, total=n)prenoms <- c("STÉPHANE", "KEVIN", "ENZO")
df_fr |>
filter(year>1947) |>
filter(name %in% prenoms, sex=="M") %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))We can investigate surges of popularity for female English names in the way we did for male French names.
hypenames <- c('Jessica', 'Jennifer', 'Dana', 'Monica', 'Laura')
(babynames %>%
filter(year > 1947) |>
filter(name %in% hypenames, sex=='F') %>%
inner_join(babynames::births, by=c("year")) %>%
ggplot() +
aes(x=year, y=2*n/births, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
plotly::ggplotly()Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year. Use stacked area style plot.
prenoms <- c("JEAN", "LUC", "MATHIEU", "MARC", "PAUL", "PIERRE", "JOSEPH", "FRANÇOIS")
df_fr %>%
filter(year>1947) |>
filter(name %in% prenoms, sex=="M") %>%
mutate(name= as_factor(name)) %>%
mutate(name= fct_rev(name)) %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name, fill=name) +
scale_fill_viridis_d() +
geom_area(position = "stack") +
# scale_y_log10() +
ggtitle(glue("Popularities of Boy Names"))Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year. Use line plot.
q <- (df_fr %>%
filter(year > 1947) |>
filter(name %in% prenoms, sex=="M") %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
plotly::ggplotly()
q Look for the translation of these names in US Data
firstnames <- str_to_title(c("JOHN", "LUKE", "MATTHEW", "MARK", "PAUL", "PETER", "JOSEPH", "FRANCIS"))
(babynames %>%
filter(year > 1947) |>
filter(name %in% firstnames, sex=='M') %>%
inner_join(babynames::births, by=c("year")) %>%
ggplot() +
aes(x=year, y=2*n/births, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
plotly::ggplotly()The variations of popularity exhibit different patterns