# Copyright 2020 Thibaut Boissin
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os

import numpy as np
import pandas as pd

# list of exported columns
col_of_interrest = ['dep', 'jour', 'sexe', 'hosp', 'rea', 'rad', 'dc', 'hosp_cum', 'rea_cum', 'rad_cum', 'dc_cum', ]

# first auto download the latest version of the dataset
# tested under linux (do windows has curl command ?
os.system("curl -o donnes-hospitalieres-covid19-latest.csv -L https://www.data.gouv.fr/fr/datasets/r/63352e38-d353-4b54-bfd1-f1b3ee1cabd7")
df = pd.read_csv("donnes-hospitalieres-covid19-latest.csv", sep=";", parse_dates=['jour'])

# column name harmonization: columns ending with _cum are cumulative, while others are daily
df[['rad_cum', 'dc_cum']] = df[['rad', 'dc']]
df[['rad', 'dc']] = df.groupby(['dep', 'sexe'])[['rad', 'dc']].diff()
df[['hosp_cum', 'rea_cum']] = df.groupby(['dep', 'sexe'])[['hosp', 'rea']].cumsum()

###############################################################
# rule 1: cumulative number must not decrease:
###############################################################
# adds two columns depending on the erroneous variable
df['flag_rad'] = df['rad'] < 0
df['flag_dc'] = df['dc'] < 0

# export flagged rows in invalid_cumultive_values.csv
invalid_cumulation = df.query('flag_rad!=0 or flag_dc!=0')
invalid_cumulation.to_csv('invalid_cumultive_values.csv', index=False)
print("%.3f percent of the rows are flagged because of invalid cumulative value" % (
            float(len(invalid_cumulation))*100 / float(len(df))))

###############################################################
# rule 2: numbers for each date/dep numbers for sexe=0 must be the sum of sexe=1 and sexe=2
###############################################################
# compute values for sexe=0 - ( values for sexe=1 + sexe=2 )
discrepancies_sexe = df.query('sexe!=0').groupby(['dep', 'jour'])['hosp', 'rea', 'rad_cum', 'dc_cum'].sum() - \
                     df.query('sexe==0').set_index(['dep', 'jour'])[['hosp', 'rea', 'rad_cum', 'dc_cum']]
invalid_sex_sum = discrepancies_sexe[np.abs(discrepancies_sexe).sum(axis=1) != 0]
# store flagged ( dep, jour ) blocks in invalid_sexe_sum.csv
invalid_sex_sum.to_csv('invalid_sexe_sum.csv', index=False)
print("%.3f percent of the (dep, jour) are flagged because of invalid summation over sexe column" % (
            float(len(invalid_sex_sum))*100 / float(len(df.query('sexe==0')))))

###############################################################
# rule 3: every dc or rad must have been in hosp or rea before
###############################################################
# compute values difference between hosp_cum + rea_cum - dc_cum
df['hosp_cum_plus_rea_cum_minus_dc_cum'] = (df[df.sexe != 0].hosp_cum + df[df.sexe != 0].rea_cum) - (
    df[df.sexe != 0].dc_cum)
invalid_state = df[df['hosp_cum_plus_rea_cum_minus_dc_cum'] < 0][
    col_of_interrest + ['hosp_cum_plus_rea_cum_minus_dc_cum']]
invalid_state.to_csv('invalid_status.csv', index=False)
print("%.3f percent of the rows violate the fact that deceased people must have been counted as hosp or rea previously" % (
            float(len(invalid_state))*100 / float(len(df.query('sexe!=0')))))

##################################################
# bonus: question, are rea also counted in hosp ?
##################################################
flagged_bonus = df[df.sexe != 0].hosp - df[df.sexe != 0].rea < 0
print("%f percent of the row disproves that rea are also counted as hosp" % (
            float(flagged_bonus.sum())*100 / float(len(df[df.sexe != 0]))))

# df.describe().to_csv()