import polars as pl
import json
import altair as alt
# Raw first-name counts. The department code is forced to a string column
# because the predicates below compare it against string codes.
url_babies = "https://github.com/Keybby/igr-baby-names/raw/refs/heads/main/data/dataset.csv"
babies = pl.read_csv(url_babies, schema_overrides={"dpt": pl.String})

# Reusable row predicates (polars expressions, evaluated lazily inside filters).
is_rare = pl.col("name").eq("_PRENOMS_RARES")  # placeholder bucket name
is_corsica = pl.col("dpt").eq("20")  # department code "20"
is_dom_tom = pl.col("dpt").str.starts_with("97")  # department codes beginning with "97"
is_male = pl.col("sexe").eq("M")
is_female = pl.col("sexe").eq("F")
# Geometric data.
# Department outlines: referenced by URL; the "features" property of the
# GeoJSON document holds the records.
url_geojson = "https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
geodata = alt.Data(url=url_geojson, format=alt.DataFormat(property="features"))

# Per-department positions, used as a lookup table keyed on "dpt".
# The key is read as a string, like the "dpt" column of `babies`, so that both
# sides of the lookup share the same representation (an inferred integer
# column would turn a code such as "01" into 1 and miss the match).
centers = pl.read_csv(
    "https://raw.githubusercontent.com/Keybby/igr-baby-names/refs/heads/main/data/dpt_positions.csv",
    schema_overrides={"dpt": pl.String},
)
# Clean the raw table and add coarser time bins.
babies = (
    babies
    # Drop the "_PRENOMS_RARES" placeholder rows.
    .filter(~is_rare)
    # Drop rows with a missing value in any column.
    .drop_nulls()
    # Keep only names whose total count over the whole table reaches 100.
    # Applied once, after the null rows are gone: with non-negative counts a
    # name passing here would also have passed the same test before the drop,
    # so a second, earlier pass of this filter is redundant.
    .filter(pl.col("count").sum().over("name") >= 100)
    # Bin the year: each bin is labelled by the largest multiple of the bin
    # width that is <= year (e.g. 1962 -> decade 1960, year3 1962, year30 1950).
    .with_columns(
        pl.col("year").floordiv(10).mul(10).alias("decade"),
        pl.col("year").floordiv(3).mul(3).alias("year3"),
        pl.col("year").floordiv(30).mul(30).alias("year30"),
    )
)
babies
shape: (3_611_256, 8)
| sexe | dpt | year | name | count | decade | year3 | year30 |
|---|---|---|---|---|---|---|---|
| str | str | i64 | str | i64 | i64 | i64 | i64 |
| "M" | "75" | 1962 | "AARON" | 3 | 1960 | 1962 | 1950 |
| "M" | "75" | 1976 | "AARON" | 3 | 1970 | 1974 | 1950 |
| "M" | "75" | 1982 | "AARON" | 3 | 1980 | 1980 | 1980 |
| "M" | "75" | 1984 | "AARON" | 3 | 1980 | 1983 | 1980 |
| "M" | "75" | 1985 | "AARON" | 5 | 1980 | 1983 | 1980 |
| … | … | … | … | … | … | … | … |
| "F" | "59" | 1928 | "ZULMA" | 9 | 1920 | 1926 | 1920 |
| "F" | "62" | 1928 | "ZULMA" | 3 | 1920 | 1926 | 1920 |
| "F" | "62" | 1930 | "ZULMA" | 4 | 1930 | 1929 | 1920 |
| "F" | "59" | 1931 | "ZULMA" | 5 | 1930 | 1929 | 1920 |
| "F" | "62" | 1933 | "ZULMA" | 5 | 1930 | 1932 | 1920 |
Main visualisation
# Number of names kept in the main visualisation.
n_names = 20
# False: shares are computed against every name, then the top names are kept.
# True: the top names are selected first, so shares are relative to them only.
relative = False

# Rows belonging to the `n_names` best-ranked names (dense rank of the
# per-name total count, largest first).
_is_top_name = pl.sum("count").over("name").rank("dense", descending=True) <= n_names
# Share of each row's count within the total count of its 3-year bin.
_share_in_bin = (pl.col("count") / pl.col("count").sum().over(["year3"])).alias(
    "count_percent"
)
_cell_keys = ["name", "dpt", "year3"]

if relative:
    name_dpt_yr = (
        babies.filter(_is_top_name)
        .group_by(_cell_keys)
        .agg(pl.col("count").sum())
        .with_columns(_share_in_bin)
    )
else:
    name_dpt_yr = (
        babies.group_by(_cell_keys)
        .agg(pl.col("count").sum())
        .with_columns(_share_in_bin)
        .filter(_is_top_name)
    )
# Share of each name per 3-year bin, summed over departments.
# The aggregation is the same whatever `relative` is: `name_dpt_yr` already
# carries the mode-specific "count_percent". (A separate branch for the
# relative mode used to read an undefined `name_dpt_yr_pre` and raised
# NameError.)
accu = name_dpt_yr.group_by(["year3", "name"]).agg(pl.col("count_percent").sum())
# Categorical palette used as the colour-scale range for names (25 entries).
custom_colors = [
    "#e6194b", "#3cb44b", "#ffe119", "#4363d8", "#f58231",
    "#911eb4", "#46f0f0", "#f032e6", "#bcf60c", "#fabebe",
    "#008080", "#e6beff", "#9a6324", "#fffac8", "#800000",
    "#aaffc3", "#808000", "#ffd8b1", "#000075", "#808080",
    "#ffffff", "#000000", "#a9a9a9", "#00ff00", "#ff00ff",
]
# Bar chart of the share of births per 3-year bin, coloured by name.
aggre = (
    alt.Chart(accu, width=800, height=400, title="Évolution des prénoms par date")
    .mark_bar(width=15)
    .encode(
        x=alt.X("year3:Q").title("Année (agrégée)"),
        y=alt.Y("count_percent:Q").title("Proportion des Naissances"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=custom_colors),
        ).legend(columns=2),
        tooltip=[
            alt.Tooltip("year3", title="Année"),
            alt.Tooltip("name", title="Nom"),
            # "count_percent" is a ratio, not a number of births: label it as
            # a proportion and display it as a percentage.
            alt.Tooltip(
                "count_percent:Q",
                title="Proportion des naissances",
                format=".2%",
            ),
        ],
    )
)
aggre
# Selections shared by the linked charts:
#   click_dpt     - point selection on the "dpt" field,
#   pop_selection - interval selection along the x encoding,
#   selection     - point selection on the "name" field.
click_dpt = alt.selection_point(fields=["dpt"])
pop_selection = alt.selection_interval(encodings=["x"])
selection = alt.selection_point(fields=["name"])

# Name colour when the datum is selected, light gray otherwise.
color = alt.condition(
    selection,
    alt.Color("name:N", legend=None),
    alt.value("lightgray"),
)

# One row per distinct name present in the plotted table.
make = name_dpt_yr.select("name").unique()

# Column of rectangles, one per name, acting as a selector for names.
make_selector = (
    alt.Chart(make)
    .mark_rect()
    .encode(
        color=color,
        y=alt.Y("name", title="Noms (à sélectionner)"),
    )
    .add_params(selection, pop_selection)
)
# Bar chart of name shares per 3-year bin, restricted to the clicked
# department (`click_dpt`) and the selected name (`selection`). It also hosts
# the x-interval brush (`pop_selection`).
population = (
    alt.Chart(
        name_dpt_yr, width=800, height=300, title="Évolution des prénoms par date"
    )
    .mark_bar(width=15)
    .encode(
        x=alt.X("year3").title("Année (agrégée)"),
        y=alt.Y("count_percent:Q").title("Proportion des Naissances"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=custom_colors),
        ).legend(columns=2),
        tooltip=[
            alt.Tooltip("year3", title="Année"),
            alt.Tooltip("dpt", title="Département"),
            alt.Tooltip("name", title="Nom"),
            alt.Tooltip("count:Q", title="Naissances"),
        ],
    )
    .add_params(pop_selection)
    .add_params(selection)
    .transform_filter(click_dpt)
    .transform_filter(selection)
)
# Lift Altair's row limit before rendering the charts built on `name_dpt_yr`
# (presumably larger than the default limit - confirm against the data size).
alt.data_transformers.disable_max_rows()

# Choropleth of births per department for the selected name and the brushed
# year range; clicking a department feeds `click_dpt`.
# NOTE(review): `map` shadows the Python builtin; the name is kept because the
# composition below refers to it.
map = (
    alt.Chart(name_dpt_yr, width=800)
    .mark_geoshape()
    # Attach each department's outline as "geo", matched on the department code.
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_filter(selection)
    .transform_filter(pop_selection)
    .add_params(click_dpt)
    .encode(
        shape="geo:G",
        color=alt.Color("sum(count):Q").title("Naissances locales"),
        # Outline the clicked department in black, no visible stroke otherwise.
        stroke=alt.condition(
            click_dpt,
            alt.value("black"),
            alt.value("transparent"),
        ),
        tooltip=[
            alt.Tooltip("dpt:N", title="Département"),
            alt.Tooltip("sum(count):Q", title="Naissances"),
        ],
    )
)

# Dashboard layout: name selector on the left, bar chart above the map.
make_selector | (population & map)
Diversity and big names
# For each (department, sex, decade): the share of births carried by the
# best-ranked names (dense rank <= 5) of that group.
# Excludes decade 2020 and the rows matched by `is_corsica` / `is_dom_tom`.
diversity = (
    babies.filter(pl.col("decade") != 2020, ~is_corsica, ~is_dom_tom)
    .group_by("dpt", "sexe", "decade", "name")
    .agg(pl.col("count").sum().alias("births"))
    .with_columns(
        # Births per (sex, decade), all departments together.
        pl.col("births").sum().over("sexe", "decade").alias("total_births_decade"),
        # Births per (sex, decade, department).
        pl.col("births")
        .sum()
        .over("sexe", "decade", "dpt")
        .alias("total_births_dpt"),
        # Dense rank of the name inside its (sex, decade, department) group,
        # 1 = most births. Dense ranking means ties share a rank, so more
        # than five names can have a rank <= 5.
        pl.col("births")
        .rank(method="dense", descending=True)
        .over(["sexe", "decade", "dpt"])
        .alias("rank_dpt"),
    )
    # Flag the names ranked in the top 5 of their group.
    .with_columns((pl.col("rank_dpt") <= 5).alias("is_top5"))
    .group_by(["dpt", "sexe", "decade"])
    .agg(
        # Constant within the group, so first() is enough.
        pl.col("total_births_dpt").first(),
        # Births carried by the top-5 names.
        pl.col("births").filter(pl.col("is_top5")).sum().alias("top5_births"),
        # Constant within the group as well.
        pl.col("total_births_decade").first().alias("total_births"),
        # Comma-separated list of the top-5 names (head() keeps at most its
        # default number of entries; order is not guaranteed after unique()).
        pl.col("name")
        .filter(pl.col("is_top5"))
        .unique()
        .head()
        .str.join(", ")
        .alias("top_noun"),
    )
    # Proportion of the department's births that go to its top-5 names.
    .with_columns(
        (pl.col("top5_births") / pl.col("total_births_dpt")).alias("top5_score")
    )
)
# Base chart over `diversity`, with two lookups keyed on the department code:
# the department outline (as "geo") and the "lon"/"lat" columns of `centers`.
data = (
    alt.Chart(
        diversity,
        title="Proportion des prénoms dans le top5 de leur département",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(centers, "dpt", ["lon", "lat"]),
    )
)

# One circle per department at its lon/lat position, sized by the top-5 score,
# faceted by decade (columns) and sex (rows).
fg = data.mark_circle().encode(
    latitude=alt.Latitude("lat:Q"),
    longitude=alt.Longitude("lon:Q"),
    size=alt.Size("top5_score:Q"),
    color=alt.Color("top5_score:Q"),
    fill=alt.Fill("sexe"),
    opacity=alt.OpacityValue(0.5),
    order=alt.Order("sexe"),
    column=alt.Column("decade"),
    row=alt.Row("sexe"),
    tooltip=[
        alt.Tooltip("dpt"),
        alt.Tooltip("top_noun"),
        alt.Tooltip("top5_score").format("0.0%"),
        alt.Tooltip("total_births_dpt"),
    ],
)
fg
Baptêmes
# Yearly baptism table; joined on "year" further down.
baptems_yr = pl.read_csv(
    "https://github.com/Keybby/igr-baby-names/raw/refs/heads/main/data/baptemes.csv"
)
# Optional additions to the list below (for instance "ANDRÉ", "BERNARD",
# "GEORGES"); empty by default.
extras = []

# First names counted as "Catholic-sounding" in the comparison with baptisms.
noms_cathos = [
    "MARIE", "CHRISTIAN", "CATHERINE", "JOSEPH",
    "PIERRE", "PAUL", "DOMINIQUE", "MICHEL",
    "JACQUES", "JEAN", "JEANNE",
]
noms_cathos.extend(extras)
# Yearly number of births, from 2000 onwards, for the names in `noms_cathos`.
cathos_yr = (
    babies.filter(
        pl.col("name").is_in(noms_cathos),
        pl.col("year") >= 2000,
    )
    .group_by(["year"])
    .agg(pl.col("count").sum())
)
cathos_yr

# Keep only the years present in both tables (inner join on "year").
joined = cathos_yr.join(baptems_yr, on="year", how="inner")
joined
shape: (8, 3)
| year | count | baptems |
|---|---|---|
| i64 | i64 | i64 |
| 2016 | 8668 | 249921 |
| 2010 | 9679 | 302941 |
| 2000 | 18533 | 400327 |
| 2017 | 8275 | 231165 |
| 2018 | 7856 | 215551 |
| 2015 | 9136 | 262314 |
| 2019 | 7514 | 204304 |
| 2020 | 6710 | 112123 |
# Shared base for the two line charts.
# "year" holds plain integers (2000, 2010, ...). Encoding it as temporal would
# make Vega-Lite read those numbers as epoch milliseconds (dates in 1970), so
# it is encoded as a quantitative axis with integer tick labels and no forced
# zero instead.
base = alt.Chart(
    joined,
    width=800,
    height=200,
    title="Évolution des prénoms à consonance catholique / Nombre de baptêmes",
).encode(
    alt.X("year:Q")
    .axis(format="d", tickMinStep=1)
    .scale(zero=False)
    .title("Année")
)

# Births with one of the selected names (green, own y axis).
histogram_maries = base.mark_line(color="#57A44C").encode(
    y=alt.Y("count:Q").axis(
        title="Naissances",
        titleColor="#57A44C",
        domainColor="#57A44C",
        labelColor="#57A44C",
    ),
)

# Number of baptisms (blue, own y axis).
histogram_baptems = base.mark_line(color="#4C7EA4").encode(
    y=alt.Y("baptems:Q").axis(
        title="Baptêmes",
        titleColor="#4C7EA4",
        domainColor="#4C7EA4",
        labelColor="#4C7EA4",
    ),
)

# Overlay both lines; independent y scales because the magnitudes differ.
alt.layer(histogram_baptems, histogram_maries).resolve_scale(y="independent")
Sexe
# Summed counts of the rows matched by `is_male` / `is_female` within a group.
_male_births = pl.col("count").filter(is_male).sum()
_female_births = pl.col("count").filter(is_female).sum()

# The 6 most given names whose log10(male / female) ratio lies in [-1, 1],
# i.e. neither sex outnumbers the other by more than 10 to 1.
neutral_names = (
    babies.group_by("name")
    .agg(
        pl.col("count").sum(),
        (_male_births / _female_births).log10().alias("sex_score"),
    )
    .filter(pl.col("sex_score").is_between(-1, 1))
    .sort("count", descending=True)
    .head(6)
)

# For those names, per (30-year bin, name, department): total count, its
# natural log ("lc") and the share of male births.
neutral_babies = (
    babies.join(neutral_names, on="name", how="inner")
    .group_by("year30", "name", "dpt")
    .agg(
        pl.sum("count").log().alias("lc"),
        pl.sum("count"),
        (_male_births / pl.col("count").sum()).alias("male_proportion"),
    )
)
neutral_babies
shape: (2_046, 6)
| year30 | name | dpt | lc | count | male_proportion |
|---|---|---|---|---|---|
| i64 | str | str | f64 | i64 | f64 |
| 1980 | "ANDRÉA" | "56" | 5.187386 | 179 | 0.055866 |
| 1890 | "ANDRÉA" | "17" | 5.590987 | 268 | 0.0 |
| 1920 | "DOMINIQUE" | "86" | 5.880533 | 358 | 0.712291 |
| 1980 | "NOA" | "39" | 4.26268 | 71 | 1.0 |
| 1890 | "ANDRÉA" | "40" | 4.691348 | 109 | 0.0 |
| … | … | … | … | … | … |
| 2010 | "CAMILLE" | "72" | 6.095825 | 444 | 0.254505 |
| 1950 | "ALIX" | "76" | 2.833213 | 17 | 0.0 |
| 1920 | "DOMINIQUE" | "42" | 5.874931 | 356 | 0.514045 |
| 1950 | "CAMILLE" | "26" | 2.484907 | 12 | 0.25 |
| 2010 | "NOA" | "46" | 2.197225 | 9 | 1.0 |
# Base chart over `neutral_babies`, with the department outline ("geo") and
# the "lon"/"lat" columns of `centers` looked up by department code.
data = (
    alt.Chart(neutral_babies, title="Evolution of neutral names")
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(centers, "dpt", ["lon", "lat"]),
    )
)

# Grid of maps: one row per name, one column per 30-year bin. Colour encodes
# the male proportion, opacity the log of the count ("lc").
chart = data.mark_geoshape().encode(
    shape=alt.Shape("geo:G"),
    color=alt.Color("male_proportion:Q"),
    opacity="lc",
    tooltip=alt.Tooltip(["male_proportion", "count"]),
    column=alt.Column("year30"),
    row=alt.Row("name"),
)
chart