Code
require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
<- theme_set(theme_minimal()) old_theme
require(patchwork)
require(httr)
require(glue)
require(ineq)
require(here)
require(skimr)
require(magrittr)
require(tidyverse)
<- theme_set(theme_minimal()) old_theme
The French data are built and made available by INSEE (French Governement Statistics Institute)
This dataset has been growing for a while. It has been considered by social scientists for decades. Given names are meant to give insights into a variety of phenomena, including religious observance.
A glimpse at that body of work can be found in L’archipel français by Jérome Fourquet, Le Seuil, 2019
Read the File documentation
<- 'DATA'
path_data <- 'nat2021_csv.zip'
fname <- here(path_data, fname)
fpath if (!file.exists(fpath)){
<- "https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip"
url download.file(url, fpath)
}
<- readr::read_csv2(fpath)
df_fr
|> glimpse() df_fr
Rows: 686,538
Columns: 4
$ sexe <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ preusuel <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENO…
$ annais <chr> "1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907…
$ nombre <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 1…
US data may be gathered from
Baby Names USA from 1910 to 2021 (SSA)
See https://www.ssa.gov/oact/babynames/background.html
It can also be obtained by installing and loading the “babynames” package.
Full baby name data provided by the SSA. This includes all names with at least 5 uses.
if (!require("babynames")){
install.packages("babynames")
stopifnot(require("babynames"), "Couldn't install and load package 'babynames'")
}
?babynames
Rename columns according to the next lookup table:
<- list(year="annais",
lkp sex="sexe",
name="preusuel",
n="nombre")
<- df_fr |>
df_fr rename(!!!lkp) |>
mutate(country='fr') |>
mutate(sex=as_factor(sex)) |>
mutate(sex=fct_recode(sex, "M"="1", "F"="2")) |>
mutate(sex=fct_relevel(sex, "F", "M")) |>
mutate(year=ifelse(year=="XXXX", NA, year)) |>
mutate(year=as.integer(year))
|>
df_fr sample(5) |>
glimpse()
!!!
(bang-bang-bang) is offered by rlang
package. Here, we use it to perform list unpacking (with the same intent and purposes we use dictionary unpacking in Python)
Rows: 686,538
Columns: 5
$ name <chr> "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOMS_RARES", "_PRENOM…
$ country <chr> "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "fr", "f…
$ year <int> 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 19…
$ n <dbl> 1249, 1342, 1330, 1286, 1430, 1472, 1451, 1514, 1509, 1526, 16…
$ sex <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,…
Download ‘Naissances totales par sexe’ from URL https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls
from INED.
<- here(path_data, 't35.fr.xls')
births_fr_path <- 'https://www.ined.fr/fichier/s_rubrique/168/t35.fr.xls'
births_fr_url
if (!file.exists(births_fr_path)) {
download.file(births_fr_url, births_fr_path)
}
<- readxl::read_excel(births_fr_path, skip = 3)
births_fr
<- births_fr[-1, ]
births_fr
|>
births_fr glimpse()
Rows: 130
Columns: 10
$ `Répartition par sexe et vie` <chr> "1901", "1902", "1903", "…
$ `Ensemble des nés vivants` <dbl> 917075, 904434, 884498, 8…
$ `Nés vivants - Garçons` <dbl> 468125, 462097, 451510, 4…
$ `Nés vivants - Filles` <dbl> 448950, 442337, 432988, 4…
$ `Ensemble des enfants sans vie` <dbl> 32410, 32000, 31076, 3067…
$ `Enfants sans vie - Garçons` <chr> "18522", "18172", "17875"…
$ `Enfants sans vie - Filles` <chr> "13888", "13828", "13201"…
$ `Garçons vivants pour 100 nés\nvivants` <dbl> 51.0, 51.1, 51.0, 51.0, 5…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 104.3, 104.5, 104.3, 104.…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "13…
If you have problems with the excel reader, feel free to download an equivalent csv
file from url
names(births_fr)[1] <- "year"
<- births_fr |>
births_fr mutate(year=as.integer(year)) |>
drop_na()
|>
births_fr ggplot() +
aes(x=year, y=`Ensemble des nés vivants`) +
geom_col() +
labs(title="Births in France")
<- babynames |>
babynames mutate(country='us') |>
mutate(sex=as_factor(sex))
|>
babynames glimpse()
Rows: 1,924,665
Columns: 6
$ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 18…
$ sex <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Id…
$ n <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 12…
$ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.…
$ country <chr> "us", "us", "us", "us", "us", "us", "us", "us", "us", "us", "u…
<- births
births_us
|>
births_us ggplot() +
aes(x=year, y=births) +
geom_col() +
labs(title="Births in USA")
In dataset df_fr
compute the total number of reported male and female births per year. Compute and plot the sex ratio.
<- df_fr |>
df_accounted_births_fr group_by(year, sex) |>
summarise(n=sum(n))
|>
df_accounted_births_fr glimpse()
Rows: 246
Columns: 3
Groups: year [123]
$ year <int> 1900, 1900, 1901, 1901, 1902, 1902, 1903, 1903, 1904, 1904, 1905,…
$ sex <fct> F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M, F, M,…
$ n <dbl> 237653, 177387, 257492, 195964, 261437, 204354, 261450, 207360, 2…
<- df_accounted_births_fr |>
df_app_sex_ratio_fr pivot_wider(id_cols=year,
names_from=sex,
values_from=`n`) |>
mutate(`Garçons vivants pour 100\nfilles vivantes`=100*M/F)
|>
df_app_sex_ratio_fr glimpse()
Rows: 123
Columns: 4
Groups: year [123]
$ year <int> 1900, 1901, 1902, 1903, 19…
$ F <dbl> 237653, 257492, 261437, 26…
$ M <dbl> 177387, 195964, 204354, 20…
$ `Garçons vivants pour 100\nfilles vivantes` <dbl> 74.64118, 76.10489, 78.165…
<- df_app_sex_ratio_fr |>
p_app_sex_ratio_fr ggplot() +
aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes`) +
geom_col() +
theme_minimal()
+
p_app_sex_ratio_fr labs(
title="France: Apparent sex ratio",
subtitle="Dataset: 'nat2021_csv' (INSEE)"
)
Compare with sex ratio as given in dataset from INED
<- p_app_sex_ratio_fr %+%
p_sex_ratio_fr
births_fr
+ labs(
p_sex_ratio_fr title="France: sex ratio",
subtitle="Dataset INED")
+ p_sex_ratio_fr) +
(p_app_sex_ratio_fr plot_annotation(
title="Evolution of sex ratio at birth in France",
subtitle="Left: INSEE data. Right: INED data"
)
|>
df_app_sex_ratio_fr inner_join(births_fr, by="year") |>
glimpse()
Rows: 121
Columns: 13
Groups: year [121]
$ year <int> 1901, 1902, 1903, 1904, …
$ F <dbl> 257492, 261437, 261450, …
$ M <dbl> 195964, 204354, 207360, …
$ `Garçons vivants pour 100\nfilles vivantes.x` <dbl> 76.10489, 78.16568, 79.3…
$ `Ensemble des nés vivants` <dbl> 917075, 904434, 884498, …
$ `Nés vivants - Garçons` <dbl> 468125, 462097, 451510, …
$ `Nés vivants - Filles` <dbl> 448950, 442337, 432988, …
$ `Ensemble des enfants sans vie` <dbl> 32410, 32000, 31076, 306…
$ `Enfants sans vie - Garçons` <chr> "18522", "18172", "17875…
$ `Enfants sans vie - Filles` <chr> "13888", "13828", "13201…
$ `Garçons vivants pour 100 nés\nvivants` <dbl> 51.0, 51.1, 51.0, 51.0, …
$ `Garçons vivants pour 100\nfilles vivantes.y` <dbl> 104.3, 104.5, 104.3, 104…
$ `Garçons sans vie pour 100\nfilles sans vie` <chr> "133.40000000000001", "1…
|>
df_app_sex_ratio_fr inner_join(births_fr, by="year") |>
ggplot() +
aes(x=year, y=`Garçons vivants pour 100\nfilles vivantes.x`/`Garçons vivants pour 100\nfilles vivantes.y`) +
geom_point(size=.5) +
scale_y_log10() +
ylab('Ratio between apparent sex ratio and\n exact sex ratio') +
labs(
title="French data, confronting INSEE and INED data"
)
Consider the fluctuations of the sex ratio through the years.
Are they consistent with the hypothesis: the sex of newborns are independently. identically distributed with the probability of getting a girl equal to \(.48\)?
Consider again the fluctuations of the sex ratio through the years.
Assume that for each year the sex of newborns are independently. identically distributed with the probability of getting a girl depending on the year.
Are the data consistent with the hypothesis: the probability of getting a girl remains constant thoughout the years?
Every year, in each country, for both sex, the name counts define a discrete probability distribution over the set of names (the universe).
This distribution, just as an income or wealth distribution, is (usually) far from being uniform. We want to assess how uneven it is.
We use the tools developed in econometrics.
Without loss of generality, we assume that we handle a distribution over positive integers \(1, \ldots, n\) where \(n\) is the number of distinct names given during a year.
We assume that frequencies \(p_1, p_2, \ldots, p_n\) are given in ascending order, ties are broken arbitrarily.
The Lorenz function
(Lorenz not Lorentz
) maps \([0, 1] \to [0, 1]\).
\[L(x) = \sum_{i=1}^{\lfloor nx \rfloor} p_i .\]
Note that this is a piecewise constant function.
Compute and plot the Lorenz function for a given sex
, year
and country
<- function(df) {
make_lorenz_df |>
df group_by(year, sex) |>
arrange(n) |>
mutate(rr=row_number()/n(), L=cumsum(n)/sum(n), p=n/sum(n)) |>
ungroup()
}
rr
, L
and p
act as window functions. The window is defined by partitioning by year, sex
and ordering by n
. In SQL parlance: WINDOW w AS (PARTITION BY year, sex ORDER BY n)
<- df_fr |>
df_lorenz_fr filter(name != '_PRENOMS_RARES' & !is.na(year)) |>
make_lorenz_df()
<- babynames |>
df_lorenz_us make_lorenz_df()
<- function(df, ze_year=2020, ze_country='fr'){
plot_lorenz |>
df filter(year==ze_year) |>
ggplot() +
aes(x=rr, y=L, linetype=sex) +
geom_line() +
coord_fixed() +
xlab("") +
ylab("") +
geom_abline(intercept=0, slope=1, linetype="dotted") +
labs(title="Lorenz curve for babynames distribution",
subtitle=glue("Year: {ze_year}"),
caption=glue("Country: {ze_country}")
) }
plot_lorenz(df_lorenz_fr, 2010, 'France') |
plot_lorenz(df_lorenz_us, 2010, 'USA')
plot_lorenz(df_lorenz_fr, 1910, 'France') |
plot_lorenz(df_lorenz_us, 1910, 'USA')
Design an animated plot that shows the evolution of the Lorenz curve of babynames distribution through the years for a given sex and country.
<- filter(df_lorenz_fr,
p_inter %% 5 ==0,
year floor(rr*100)%% 5==0) |>
ggplot() +
aes(x=rr, y=L, linetype=sex, frame=year) +
geom_line() +
coord_fixed() +
xlab("") +
ylab("") +
geom_abline(intercept=0, slope=1, linetype="dotted")
+
(p_inter labs(title="Lorenz curve for babynames distribution",
caption=glue("Country: France")
|> plotly::ggplotly() ))
(%+%
p_inter filter(df_lorenz_us,
%% 5 ==0,
year floor(rr*100)%% 5==0) +
labs(title="Lorenz curve for babynames distribution",
caption=glue("Country: US"))
|> plotly::ggplotly() )
The Lorenz curve summarizes how far a discrete probability distribution is from the uniform distribution. This is a very rich summary and it is difficult to communicate this message to a wide audience. People tend to favor numerical indices (they don’t really understand, but they get used to it): Gini, Atkinson, Theil, …
The Gini index is twice the surface of the area comprised between curves \(y=x\) and \(y=L(x)\).
\[G = 2 \times \int_0^1 (x -L(x)) \mathrm{d}x\]
The next formula allows us to compute it efficiently.
\[G={\frac {2\sum _{i=1}^{n}i p_{i}}{n\sum _{i=1}^{n}p_{i}}}-{\frac {n+1}{n}}.\]
Compute and plot Gini index of names distribution over time for sex and countries
<- df_lorenz_fr |>
p_gini group_by(year, sex) |>
summarize(gini=2 * sum(rr*p) - 1 - 1/n()) |>
ggplot() +
aes(x=year, y=gini, linetype=sex) +
geom_line() +
theme(legend.position="none") +
ylab("Gini index")
for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
<- p_gini +
p_gini geom_vline(xintercept = y, linetype="dotted")
}
<- p_gini +
p_gini_fr labs(subtitle="Country: France")
<- (
p_gini_us %+%
p_gini |>
(df_lorenz_us group_by(year, sex) |>
summarize(gini=2 * sum(rr*p) - 1 - 1/n(), .groups="drop")) +
labs(
subtitle="Country: US"
) )
| p_gini_us) +
(p_gini_frplot_annotation(
title="Gini index of names distributions",
subtitle="..."
)
<- function (df) {
giniplot |>
df filter(name != '_PRENOMS_RARES' & !is.na(year)) |>
group_by(year, sex) |>
mutate(gini=ineq::ineq(n)) |>
ggplot() +
aes(x=year, y=gini, linetype=sex) +
geom_line() +
theme(legend.position = "none")
}
<- giniplot(filter(df_fr, year> 1947))
p1 <- giniplot(filter(babynames, year>1947))
p2
| p2 ) +
( p1 plot_annotation(
title = "Evolution of Gini coeffcients of babynames distribution",
subtitle="France (left), USA (right) \n plain: girls dotted: boys"
)
PRENOMS RARES
in FranceFor each sex, Plot the proportion of births given _PRENOMS_RARES
as a function of year.
|>
df_fr filter(!is.na(year)) |>
group_by(year, sex) |>
mutate(total=sum(n)) |>
filter(name=='_PRENOMS_RARES') |>
select(-name) |>
mutate(share= 100*n/total) |>
ungroup() |>
ggplot() +
aes(x=year, y=share, color=sex, linetype=sex) +
geom_line() +
labs(
title="Share of rare names",
subtitle="French data (INSEE)"
+
) theme_minimal()
sum()
works as a window function over partition by year, sex
.
Mary
in US Data
Plot the proportion of female births given name ‘MARIE’ or ‘MARIE-…’ as a fucntion of year. Proceed in such a way that the reader can see the share of compounded names. We are expecting an area plot
Have a look at r-graph-gallery: stacked area and at ggplot documentation. Pay attention on the way you stack the area corresponding to names matching pattern ’MARIE-.*’ over or under the are corresponding to babies named ‘MARIE’
theme_set(theme_minimal())
<- function(data, .name_stem='MARIE', .sex='F'){
share_name %>%
data ::filter(sex==.sex, !is.na(year)) %>%
dplyrselect(-sex) %>%
group_by(year) %>%
summarize(strict=sum(ifelse(name==.name_stem, n, 0)),
loose=sum(ifelse(stringr::str_starts(name, glue('{.name_stem}-')), n, 0)),
total=sum(n)) %>%
transmute(year=year,
strict=strict/total,
loose=loose/total) %>%
pivot_longer(strict:loose,
names_to=c("set"),
values_to="share") %>%
mutate(set=factor(set,
levels=c("loose", "strict"),
ordered=TRUE))
}
<- function(df, .name_stem = "MARIE", .sex = "F"){
decline_and_fall
<- share_name(df, .name_stem, .sex)
df <- max(pull(df, share), na.rm = T)
maxshare
<- df |>
p ggplot(aes(x=year)) +
geom_area(aes(y=share,
fill=set),
position="stack") +
ylab("share") +
annotate('text',
label="1st WW",
x = 1916,
y=0.1*maxshare) +
annotate('text',
label="2nd WW",
x = 1942,
y=0.1*maxshare) +
annotate("text",
label= "1969",
x= 1968,
y= 0.5*maxshare) +
theme_minimal()
for(y in c(1914, 1918, 1938, 1945, 1958, 1969)) {
<- p + geom_vline(xintercept = y, linetype="dotted")
p
}
p
}
decline_and_fall(df_fr, .name_stem = "MARIE", .sex="F")
See Graphique 3, page 48, de L’archipel français de J. Fourquet. Le Seuil. Essais. Vol. 898.
Answer the same question for JEANNE and FRANCE
<- decline_and_fall(df_fr, "JEANNE")
p_jeanne # p_jeanne
<- decline_and_fall(df_fr, "FRANCE")
p_france
# p_france
<- p_jeanne / p_france
patchw
+ plot_annotation(
patchw title="Decline of classic names",
subtitle="Jeanne and France"
)
Plot the popularities of KEVIN
, ENZO
, STÉPHANE
as a function of year
.
<- rename(df_accounted_births_fr, total=n) df_accounted_births_fr
<- c("STÉPHANE", "KEVIN", "ENZO")
prenoms
|>
df_fr filter(year>1947) |>
filter(name %in% prenoms, sex=="M") %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))
We can investigate surges of popularity for female English names in the way we did for male French names.
<- c('Jessica', 'Jennifer', 'Dana', 'Monica', 'Laura')
hypenames
%>%
(babynames filter(year > 1947) |>
filter(name %in% hypenames, sex=='F') %>%
inner_join(babynames::births, by=c("year")) %>%
ggplot() +
aes(x=year, y=2*n/births, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
::ggplotly() plotly
Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year
. Use stacked area style plot.
<- c("JEAN", "LUC", "MATHIEU", "MARC", "PAUL", "PIERRE", "JOSEPH", "FRANÇOIS")
prenoms
%>%
df_fr filter(year>1947) |>
filter(name %in% prenoms, sex=="M") %>%
mutate(name= as_factor(name)) %>%
mutate(name= fct_rev(name)) %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name, fill=name) +
scale_fill_viridis_d() +
geom_area(position = "stack") +
# scale_y_log10() +
ggtitle(glue("Popularities of Boy Names"))
Plot the popularities of “JEAN”, “LUC”, “MATHIEU”, “MARC”, “PAUL”, “PIERRE”, “JOSEPH”, “FRANÇOIS” as a function of year
. Use line plot.
<- (df_fr %>%
q filter(year > 1947) |>
filter(name %in% prenoms, sex=="M") %>%
inner_join(df_accounted_births_fr, by=c("year", "sex")) %>%
ggplot() +
aes(x=year, y=n/total, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
::ggplotly()
plotly
q
Look for the translation of these names in US Data
<- str_to_title(c("JOHN", "LUKE", "MATTHEW", "MARK", "PAUL", "PETER", "JOSEPH", "FRANCIS"))
firstnames
%>%
(babynames filter(year > 1947) |>
filter(name %in% firstnames, sex=='M') %>%
inner_join(babynames::births, by=c("year")) %>%
ggplot() +
aes(x=year, y=2*n/births, linetype=name, colour=name) +
geom_line() +
scale_y_log10() +
ggtitle(glue("Popularities"))) |>
::ggplotly() plotly
The variations of popularity exhibit different patterns