# Copyright 2020 Thibaut Boissin # # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated # documentation files (the "Software"), to deal in the Software without restriction, including without limitation the # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all copies or substantial portions of # the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import os import numpy as np import pandas as pd # list of exported columns col_of_interrest = ['dep', 'jour', 'sexe', 'hosp', 'rea', 'rad', 'dc', 'hosp_cum', 'rea_cum', 'rad_cum', 'dc_cum', ] # first auto download the latest version of the dataset # tested under linux (do windows has curl command ? os.system("curl -o donnes-hospitalieres-covid19-latest.csv -L https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7") df = pd.read_csv("donnes-hospitalieres-covid19-latest.csv", sep=";", parse_dates=['jour']) # column name harmonization: columns ending with _cum are cumulative, while others are daily df[['rad_cum', 'dc_cum']] = df[['rad', 'dc']] df[['rad', 'dc']] = df.groupby(['dep', 'sexe'])[['rad', 'dc']].diff() df[['hosp_cum', 'rea_cum']] = df.groupby(['dep', 'sexe'])[['hosp', 'rea']].cumsum() ############################################################### # rule 1: cumulative number must not decrease: ############################################################### # adds two columns depending on the erroneous variable df['flag_rad'] = df['rad'] < 0 df['flag_dc'] = df['dc'] < 0 # export flagged rows in invalid_cumultive_values.csv invalid_cumulation = df.query('flag_rad!=0 or flag_dc!=0') invalid_cumulation.to_csv('invalid_cumultive_values.csv', index=False) print("%.3f percent of the rows are flagged because of invalid cumulative value" % ( float(len(invalid_cumulation))*100 / float(len(df)))) ############################################################### # rule 2: numbers for each date/dep numbers for sexe=0 must be the sum of sexe=1 and sexe=2 ############################################################### # compute values for sexe=0 - ( values for sexe=1 + sexe=2 ) discrepancies_sexe = df.query('sexe!=0').groupby(['dep', 'jour'])['hosp', 'rea', 'rad_cum', 'dc_cum'].sum() - \ df.query('sexe==0').set_index(['dep', 'jour'])[['hosp', 'rea', 'rad_cum', 'dc_cum']] invalid_sex_sum = discrepancies_sexe[np.abs(discrepancies_sexe).sum(axis=1) != 0] # store flagged ( dep, jour ) blocks in invalid_sexe_sum.csv invalid_sex_sum.to_csv('invalid_sexe_sum.csv', index=False) print("%.3f percent of the (dep, jour) are flagged because of invalid summation over sexe column" % ( float(len(invalid_sex_sum))*100 / float(len(df.query('sexe==0'))))) ############################################################### # rule 3: every dc or rad must have been in hosp or rea before ############################################################### # compute values difference between hosp_cum + rea_cum - dc_cum df['hosp_cum_plus_rea_cum_minus_dc_cum'] = (df[df.sexe != 0].hosp_cum + df[df.sexe != 0].rea_cum) - ( df[df.sexe != 0].dc_cum) invalid_state = df[df['hosp_cum_plus_rea_cum_minus_dc_cum'] < 0][ col_of_interrest + ['hosp_cum_plus_rea_cum_minus_dc_cum']] invalid_state.to_csv('invalid_status.csv', index=False) print("%.3f percent of the rows violate the fact that deceased people must have been counted as hosp or rea previously" % ( float(len(invalid_state))*100 / float(len(df.query('sexe!=0'))))) ################################################## # bonus: question, are rea also counted in hosp ? ################################################## flagged_bonus = df[df.sexe != 0].hosp - df[df.sexe != 0].rea < 0 print("%f percent of the row disproves that rea are also counted as hosp" % ( float(flagged_bonus.sum())*100 / float(len(df[df.sexe != 0])))) # df.describe().to_csv()