Code de reproduction, à partir d'un dump de tables_index avec :
COPY ( SELECT DISTINCT ON (resource_id) * FROM tables_index WHERE csv_detective::text LIKE '%code_commune%' ORDER BY resource_id, created_at) TO '/tmp/tables_index_communes.csv' WITH ( FORMAT CSV, HEADER TRUE );
import json
import pandas as pd
import requests
from datetime import datetime
from datagouv import Client
tables_index = pd.read_csv(
"tables_index_communes.csv",
usecols=["resource_id", "csv_detective"],
dtype=str,
)
tables_index["csv_detective"] = tables_index["csv_detective"].apply(json.loads)
def extract_col_commune(inspection: dict[str, dict]):
for col, detected in inspection["columns"].items():
if detected["format"].startswith("code_commune"):
return col
return None
def extract_summable_col(inspection):
for col, detected in inspection["columns"].items():
if detected["python_type"] in ["int", "float"]:
return col
return None
tables_index["col_commune"] = tables_index["csv_detective"].apply(extract_col_commune)
tables_index["summable_col"] = tables_index["csv_detective"].apply(extract_summable_col)
tables_index["nb_rows"] = tables_index["csv_detective"].apply(lambda d: d.get("total_lines"))
restr = tables_index.loc[
(tables_index["col_commune"].notna())
& (tables_index["summable_col"].notna()),
["resource_id", "col_commune", "summable_col", "nb_rows"]
].sort_values(by="nb_rows", ascending=False).reset_index(drop=True)
catalog = pd.read_parquet(
"https://hydra.s3.rbx.io.cloud.ovh.net/parquet/4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d.parquet",
columns=["dataset.id", "dataset.title", "id", "title", "preview_url", "modified"],
).rename({"id": "resource_id"}, axis=1)
details = pd.merge(
restr,
catalog,
on="resource_id",
how="left",
).dropna(subset=["preview_url", "dataset.id"])
details = details.loc[details["modified"] >= datetime(2024, 1, 1).date()]
final = details.loc[details["nb_rows"] > 10000]
# pas fait parce que trop lent, pour récupérer pour chaque ressource, la liste des communes référencées dans le fichier
client = Client()
def get_scoped_communes(row):
res = client.get_all_from_api_query(
# we need to perform an aggregation operation for the groupby to work
(
"https://tabular-api.data.gouv.fr/api/resources/"
f"{row['resource_id']}/data/?"
f"{row['col_commune']}__isnotnull&{row['col_commune']}__groupby"
f"&{row['summable_col']}__sum"
f"&page_size=200"
),
_ignore_base_url=True,
next_page="links.next",
)
return res
list(get_scoped_communes(final.iloc[0]))
...
Vues
0
Téléchargements
0
Documentation des fichiers manquante
Couverture temporelle non renseignée
Il n'y a pas encore de discussion