data.gouv.fr

Code de reproduction, à partir d'un dump de tables_index avec :

COPY ( SELECT DISTINCT ON (resource_id) * FROM tables_index WHERE csv_detective::text LIKE '%code_commune%' ORDER BY resource_id, created_at) TO '/tmp/tables_index_communes.csv' WITH ( FORMAT CSV, HEADER TRUE );

import json
import pandas as pd
import requests
from datetime import datetime
from datagouv import Client

tables_index = pd.read_csv(
    "tables_index_communes.csv",
    usecols=["resource_id", "csv_detective"],
    dtype=str,
)
tables_index["csv_detective"] = tables_index["csv_detective"].apply(json.loads)

def extract_col_commune(inspection: dict[str, dict]):
    for col, detected in inspection["columns"].items():
        if detected["format"].startswith("code_commune"):
            return col
    return None

def extract_summable_col(inspection):
    for col, detected in inspection["columns"].items():
        if detected["python_type"] in ["int", "float"]:
            return col
    return None


tables_index["col_commune"] = tables_index["csv_detective"].apply(extract_col_commune)
tables_index["summable_col"] = tables_index["csv_detective"].apply(extract_summable_col)
tables_index["nb_rows"] = tables_index["csv_detective"].apply(lambda d: d.get("total_lines"))

restr = tables_index.loc[
    (tables_index["col_commune"].notna())
    & (tables_index["summable_col"].notna()),
    ["resource_id", "col_commune", "summable_col", "nb_rows"]
].sort_values(by="nb_rows", ascending=False).reset_index(drop=True)

catalog = pd.read_parquet(
    "https://hydra.s3.rbx.io.cloud.ovh.net/parquet/4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d.parquet",
    columns=["dataset.id", "dataset.title", "id", "title", "preview_url", "modified"],
).rename({"id": "resource_id"}, axis=1)

details = pd.merge(
    restr,
    catalog,
    on="resource_id",
    how="left",
).dropna(subset=["preview_url", "dataset.id"])
details = details.loc[details["modified"] >= datetime(2024, 1, 1).date()]

final = details.loc[details["nb_rows"] > 10000]

# pas fait parce que trop lent, pour récupérer pour chaque ressource, la liste des communes référencées dans le fichier
client = Client()

def get_scoped_communes(row):
    res = client.get_all_from_api_query(
        # we need to perform an aggregation operation for the groupby to work 
        (
            "https://tabular-api.data.gouv.fr/api/resources/"
             f"{row['resource_id']}/data/?"
            f"{row['col_commune']}__isnotnull&{row['col_commune']}__groupby"
            f"&{row['summable_col']}__sum"
            f"&page_size=200"
        ),
        _ignore_base_url=True,
        next_page="links.next",
    )
    return res

list(get_scoped_communes(final.iloc[0]))
...

[Hackathon IGN] Données tabulaires avec code commune

Fichiers ⁽¹⁾

Réutilisations et API ⁽¹⁾

Discussions ⁽⁰⁾

Ressources communautaires

Informations

0 discussions

[Hackathon IGN] Données tabulaires avec code commune

Fichiers (1)

Réutilisations et API (1)

Discussions (0)

Ressources communautaires

Informations

0 discussions

Fichiers ⁽¹⁾

Réutilisations et API ⁽¹⁾

Discussions ⁽⁰⁾