import polars as pl
import json
import altair as alt
# Raw baby-name counts. The department code ("dpt") is forced to a string so
# that it is never parsed as a number.
babies = pl.read_csv(
    "https://github.com/Keybby/igr-baby-names/raw/refs/heads/main/data/dataset.csv",
    schema_overrides={"dpt": pl.String},
)

# Reusable row predicates (polars expressions) shared by the analyses below.
is_rare = pl.col("name").eq("_PRENOMS_RARES")  # presumably the aggregated "rare first names" bucket
is_corsica = pl.col("dpt").eq("20")
is_dom_tom = pl.col("dpt").str.starts_with("97")
is_male = pl.col("sexe").eq("M")
is_female = pl.col("sexe").eq("F")
# Department outlines. The GeoJSON is referenced by URL (not embedded) and the
# renderer reads the geometries from its "features" property.
url_geojson = "https://france-geojson.gregoiredavid.fr/repo/departements.geojson"
geodata = alt.Data(url=url_geojson, format=alt.DataFormat(property="features"))

# Per-department positions; the charts below look up "lon"/"lat" by "dpt".
# The department code is read as a string, exactly like `babies["dpt"]`, so
# that type inference can never turn a code such as "01" into the integer 1
# and make the lookup on "dpt" miss.
centers = pl.read_csv(
    "https://raw.githubusercontent.com/Keybby/igr-baby-names/refs/heads/main/data/dpt_positions.csv",
    schema_overrides={"dpt": pl.String},
)
# Clean the raw table and add the aggregated time buckets used by the charts.
babies = (
    babies
    # Drop the "_PRENOMS_RARES" rows (see `is_rare`).
    .filter(~is_rare)
    # Drop every row that has a missing value in any column.
    .drop_nulls()
    # Keep only names given at least 100 times overall. This is applied once,
    # after the incomplete rows are gone, so the threshold is evaluated on the
    # rows that are actually kept.
    .filter(pl.col("count").sum().over("name") >= 100)
    .with_columns(
        # Floor the year to the start of its 10-, 3- and 30-year bucket
        # (e.g. 1962 -> 1960 / 1962 / 1950).
        pl.col("year").floordiv(10).mul(10).alias("decade"),
        pl.col("year").floordiv(3).mul(3).alias("year3"),
        pl.col("year").floordiv(30).mul(30).alias("year30"),
    )
)
babies
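shape: (3_611_256, 8)

| sexe | dpt  | year | name    | count | decade | year3 | year30 |
|------|------|------|---------|-------|--------|-------|--------|
| str  | str  | i64  | str     | i64   | i64    | i64   | i64    |
| "M"  | "75" | 1962 | "AARON" | 3     | 1960   | 1962  | 1950   |
| "M"  | "75" | 1976 | "AARON" | 3     | 1970   | 1974  | 1950   |
| "M"  | "75" | 1982 | "AARON" | 3     | 1980   | 1980  | 1980   |
| "M"  | "75" | 1984 | "AARON" | 3     | 1980   | 1983  | 1980   |
| "M"  | "75" | 1985 | "AARON" | 5     | 1980   | 1983  | 1980   |
| …    | …    | …    | …       | …     | …      | …     | …      |
| "F"  | "59" | 1928 | "ZULMA" | 9     | 1920   | 1926  | 1920   |
| "F"  | "62" | 1928 | "ZULMA" | 3     | 1920   | 1926  | 1920   |
| "F"  | "62" | 1930 | "ZULMA" | 4     | 1930   | 1929  | 1920   |
| "F"  | "59" | 1931 | "ZULMA" | 5     | 1930   | 1929  | 1920   |
| "F"  | "62" | 1933 | "ZULMA" | 5     | 1930   | 1932  | 1920   |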

Main visualisation

# Number of names shown in the main visualisation.
n_names = 20
# False: shares are computed against ALL births of a period, then only the top
#        names are kept.
# True:  only the top names are kept first, so shares are relative to the
#        births of those top names.
relative = False

# True for rows whose name is among the `n_names` names with the largest
# overall count (dense rank, so names with equal totals share a rank).
top_names_only = (
    pl.sum("count").over("name").rank("dense", descending=True) <= n_names
)
# Share of a row's count within the total count of its 3-year period.
share_of_period = (pl.col("count") / pl.col("count").sum().over("year3")).alias(
    "count_percent"
)

if relative:
    name_dpt_yr = (
        babies.filter(top_names_only)
        .group_by(["name", "dpt", "year3"])
        .agg(pl.col("count").sum())
        .with_columns(share_of_period)
    )
else:
    name_dpt_yr = (
        babies.group_by(["name", "dpt", "year3"])
        .agg(pl.col("count").sum())
        .with_columns(share_of_period)
        .filter(top_names_only)
    )

# Collapse the departments: one share per (period, name). Both modes aggregate
# the same `name_dpt_yr` table.
accu = name_dpt_yr.group_by(["year3", "name"]).agg(pl.col("count_percent").sum())
# Categorical palette used as the colour range for names: 25 hex colours, in
# the order they are handed to the colour scale.
custom_colors = """
    #e6194b #3cb44b #ffe119 #4363d8 #f58231
    #911eb4 #46f0f0 #f032e6 #bcf60c #fabebe
    #008080 #e6beff #9a6324 #fffac8 #800000
    #aaffc3 #808000 #ffd8b1 #000075 #808080
    #ffffff #000000 #a9a9a9 #00ff00 #ff00ff
""".split()

# Stacked bars: share of births per 3-year period, one colour per name.
aggre = (
    alt.Chart(accu, width=800, height=400, title="Évolution des prénoms par date")
    .mark_bar(width=15)
    .encode(
        x=alt.X("year3:Q").title("Année (aggrégée)"),
        y=alt.Y("count_percent:Q").title("Proportion des Naissances"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=custom_colors),
        ).legend(columns=2),
        tooltip=[
            alt.Tooltip("year3", title="Année"),
            alt.Tooltip("name", title="Nom"),
            # `count_percent` is a share, not a number of births: label it as
            # such and display it as a percentage.
            alt.Tooltip(
                "count_percent:Q", title="Proportion des naissances", format=".2%"
            ),
        ],
    )
)
aggre
# Interactive parameters shared by the linked charts:
#   click_dpt     - point selection on the "dpt" field
#   pop_selection - interval (brush) selection along the x encoding
#   selection     - point selection on the "name" field
click_dpt = alt.selection_point(fields=["dpt"])
pop_selection = alt.selection_interval(encodings=["x"])
selection = alt.selection_point(fields=["name"])

# Selected names keep their categorical colour; the others are greyed out.
color = alt.condition(
    selection,
    alt.Color("name:N", legend=None),
    alt.value("lightgray"),
)

# One row per distinct name present in `name_dpt_yr`.
make = name_dpt_yr.select("name").unique()

# Clickable list of names: one rectangle per name.
make_selector = (
    alt.Chart(make)
    .mark_rect()
    .encode(
        y=alt.Y("name").title("Noms (à sélectionner)"),
        color=color,
    )
    .add_params(selection, pop_selection)
)
# Bar chart of name shares per 3-year period, restricted to the clicked
# department and to the selected names.
population = (
    alt.Chart(
        name_dpt_yr,
        width=800,
        height=300,
        title="Évolution des prénoms par date",
    )
    .mark_bar(width=15)
    .encode(
        x=alt.X("year3", title="Année (aggrégée)"),
        y=alt.Y("count_percent:Q", title="Proportion des Naissances"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=custom_colors),
            legend=alt.Legend(columns=2),
        ),
        tooltip=[
            alt.Tooltip("year3", title="Année"),
            alt.Tooltip("dpt", title="Département"),
            alt.Tooltip("name", title="Nom"),
            alt.Tooltip("count:Q", title="Naissances"),
        ],
    )
    # The brush and the name selection are both driven from this chart.
    .add_params(pop_selection, selection)
    .transform_filter(click_dpt)
    .transform_filter(selection)
)

# Lift altair's row limit before rendering charts built on `name_dpt_yr`.
alt.data_transformers.disable_max_rows()

# Choropleth of births per department for the selected names, restricted to
# the periods brushed on the bar chart. Clicking a department sets `click_dpt`.
# NOTE(review): `map` shadows the Python builtin; the name is kept unchanged
# in case other cells refer to it.
map = (
    alt.Chart(name_dpt_yr, width=800)
    .mark_geoshape()
    # Attach each department's geometry, matched on the department code.
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_filter(selection)
    .encode(
        shape=alt.Shape("geo:G"),
        color=alt.Color("sum(count):Q", title="Naissances locales"),
        tooltip=[
            alt.Tooltip("dpt:N", title="Département"),
            alt.Tooltip("sum(count):Q", title="Naissances"),
        ],
        # Black outline on the selected department, no outline otherwise.
        stroke=alt.condition(
            click_dpt,
            alt.value("black"),
            alt.value("transparent"),
        ),
    )
    .add_params(click_dpt)
    .transform_filter(pop_selection)
)

# Layout: name selector on the left, bar chart above the map on the right.
alt.hconcat(make_selector, alt.vconcat(population, map))

Diversity and big names

# For every (department, sex, decade): the share of births that went to the
# five most-given names, plus the list of those names. The 2020 decade,
# Corsica ("20") and the "97x" departments are excluded.
diversity = (
    babies.filter(pl.col("decade") != 2020, ~is_corsica, ~is_dom_tom)
    .group_by("dpt", "sexe", "decade", "name")
    .agg(pl.col("count").sum().alias("births"))
    .with_columns(
        [
            # Total births per (sex, decade), all departments together.
            pl.col("births").sum().over("sexe", "decade").alias("total_births_decade"),
            # Total births per (sex, decade, department).
            pl.col("births")
            .sum()
            .over("sexe", "decade", "dpt")
            .alias("total_births_dpt"),
            # Rank of the name inside its (sex, decade, department), 1 being
            # the most given. Dense ranking: tied names share a rank, so more
            # than five names can have a rank <= 5.
            pl.col("births")
            .rank(method="dense", descending=True)
            .over(["sexe", "decade", "dpt"])
            .alias("rank_dpt"),
        ]
    )
    .with_columns(
        [
            # Mark the names ranked in the top 5 of their group.
            (pl.col("rank_dpt") <= 5).alias("is_top5"),
        ]
    )
    .group_by(["dpt", "sexe", "decade"])
    .agg(
        [
            # Constant within the group, so first() is enough.
            pl.col("total_births_dpt").first(),
            # Births that went to the top-5 names.
            pl.col("births").filter(pl.col("is_top5")).sum().alias("top5_births"),
            # Constant within the group, so first() is enough.
            pl.col("total_births_decade").first().alias("total_births"),
            # Comma-separated list of the top-5 names (capped by head()).
            pl.col("name")
            .filter(pl.col("is_top5"))
            .unique()
            .head()
            .str.join(", ")
            .alias("top_noun"),
        ]
    )
    .with_columns(
        [
            # Share of the department's births captured by its top-5 names.
            (pl.col("top5_births") / pl.col("total_births_dpt")).alias("top5_score"),
        ]
    )
)
# `diversity` enriched with each department's geometry ("geo") and its
# "lon"/"lat" position, both matched on the department code.
data = (
    alt.Chart(
        diversity,
        title="Proportion des prénoms dans le top5 de leur département",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(centers, "dpt", ["lon", "lat"]),
    )
)

# One circle per department, sized by the top-5 share; the grid is faceted by
# decade (columns) and sex (rows).
fg = data.mark_circle().encode(
    latitude="lat:Q",
    longitude="lon:Q",
    size="top5_score:Q",
    order="sexe",
    column="decade",
    row="sexe",
    fill="sexe",
    color="top5_score:Q",
    opacity=alt.value(0.5),
    tooltip=[
        alt.Tooltip("dpt"),
        alt.Tooltip("top_noun"),
        alt.Tooltip("top5_score", format="0.0%"),
        alt.Tooltip("total_births_dpt"),
    ],
)

fg

Baptêmes

# Yearly number of baptisms (joined below on "year").
baptems_yr = pl.read_csv(
    "https://github.com/Keybby/igr-baby-names/raw/refs/heads/main/data/baptemes.csv"
)

# First names counted as "Catholic-sounding" for this comparison.
noms_cathos = [
    "MARIE", "CHRISTIAN", "CATHERINE", "JOSEPH", "PIERRE", "PAUL",
    "DOMINIQUE", "MICHEL", "JACQUES", "JEAN", "JEANNE",
]
# Optional additions, currently none. Candidates: ["ANDRÉ","BERNARD","GEORGES"]
extras = []
noms_cathos.extend(extras)

# Births per year, from 2000 onwards, restricted to the names listed above.
cathos_yr = (
    babies.filter(
        pl.col("name").is_in(noms_cathos),
        pl.col("year") >= 2000,
    )
    .group_by("year")
    .agg(pl.col("count").sum())
)
cathos_yr

# Keep only the years present in both tables.
joined = cathos_yr.join(baptems_yr, on="year")
joined
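shape: (8, 3)

| year | count | baptems |
|------|-------|---------|
| i64  | i64   | i64     |
| 2016 | 8668  | 249921  |
| 2010 | 9679  | 302941  |
| 2000 | 18533 | 400327  |
| 2017 | 8275  | 231165  |
| 2018 | 7856  | 215551  |
| 2015 | 9136  | 262314  |
| 2019 | 7514  | 204304  |
| 2020 | 6710  | 112123  |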
# Shared base for the two line charts below.
# `year` is an integer column. Encoded as temporal it would be read as a
# number of milliseconds since the Unix epoch (every point would land in
# 1970), so it is plotted as a plain number instead: the axis is not forced to
# include zero and ticks are formatted as integers (no thousands separator).
base = alt.Chart(
    joined,
    width=800,
    height=200,
    title="Évolution des prénoms à consonnance catholique / Nombre de baptèmes",
).encode(
    alt.X(
        "year:Q",
        title="Année",
        scale=alt.Scale(zero=False),
        axis=alt.Axis(format="d"),
    )
)

# Births given one of the listed names (green, own y axis).
histogram_maries = base.mark_line(color="#57A44C").encode(
    y=alt.Y("count:Q").axis(
        title="Naissances",
        titleColor="#57A44C",
        domainColor="#57A44C",
        labelColor="#57A44C",
    ),
)

# Number of baptisms (blue, own y axis).
histogram_baptems = base.mark_line(color="#4C7EA4").encode(
    y=alt.Y("baptems:Q").axis(
        title="Baptèmes",
        titleColor="#4C7EA4",
        domainColor="#4C7EA4",
        labelColor="#4C7EA4",
    ),
)

# Overlay both lines; each keeps its own y scale.
alt.layer(histogram_baptems, histogram_maries).resolve_scale(y="independent")

Sexe

# The six most frequent names whose male/female ratio stays within a factor of
# ten either way: sex_score = log10(male births / female births) in [-1, 1].
neutral_names = (
    babies.group_by("name")
    .agg(
        pl.sum("count"),
        sex_score=(
            pl.col("count").filter(is_male).sum()
            / pl.col("count").filter(is_female).sum()
        ).log10(),
    )
    .filter(pl.col("sex_score").is_between(-1, 1))
    .sort("count", descending=True)
    .head(6)
)

# For those names, per (30-year period, name, department): natural log of the
# number of births ("lc"), the number of births, and the share of boys.
neutral_babies = (
    babies.join(neutral_names, on="name")
    .group_by("year30", "name", "dpt")
    .agg(
        pl.col("count").sum().log().alias("lc"),
        pl.col("count").sum(),
        male_proportion=(
            pl.col("count").filter(is_male).sum() / pl.col("count").sum()
        ),
    )
)
neutral_babies
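shape: (2_046, 6)

| year30 | name        | dpt  | lc       | count | male_proportion |
|--------|-------------|------|----------|-------|-----------------|
| i64    | str         | str  | f64      | i64   | f64             |
| 1980   | "ANDRÉA"    | "56" | 5.187386 | 179   | 0.055866        |
| 1890   | "ANDRÉA"    | "17" | 5.590987 | 268   | 0.0             |
| 1920   | "DOMINIQUE" | "86" | 5.880533 | 358   | 0.712291        |
| 1980   | "NOA"       | "39" | 4.26268  | 71    | 1.0             |
| 1890   | "ANDRÉA"    | "40" | 4.691348 | 109   | 0.0             |
| …      | …           | …    | …        | …     | …               |
| 2010   | "CAMILLE"   | "72" | 6.095825 | 444   | 0.254505        |
| 1950   | "ALIX"      | "76" | 2.833213 | 17    | 0.0             |
| 1920   | "DOMINIQUE" | "42" | 5.874931 | 356   | 0.514045        |
| 1950   | "CAMILLE"   | "26" | 2.484907 | 12    | 0.25            |
| 2010   | "NOA"       | "46" | 2.197225 | 9     | 1.0             |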
# `neutral_babies` enriched with each department's geometry ("geo") and its
# "lon"/"lat" position, both matched on the department code.
data = (
    alt.Chart(neutral_babies, title="Evolution of neutral names")
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(geodata, "properties.code"),
        as_="geo",
    )
    .transform_lookup(
        lookup="dpt",
        from_=alt.LookupData(centers, "dpt", ["lon", "lat"]),
    )
)

# Grid of maps (one column per 30-year period, one row per name): colour is
# the share of boys, opacity follows the log of the number of births.
chart = data.mark_geoshape().encode(
    shape="geo:G",
    color="male_proportion:Q",
    tooltip=["male_proportion", "count"],
    column="year30",
    row="name",
    opacity="lc",
)
chart