Today we will recreate the first chart from the New York Times article "The unique US failure to control the virus".

import pandas as pd
import altair as alt
from functools import wraps
import datetime as dt

alt.renderers.set_embed_options(actions=False)

population_uri = 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv?raw=true'
deaths_ts_uri = 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv?raw=true'
gdp_current_us_dollars = 'https://gist.githubusercontent.com/armsp/58b43f28b4bf880f3874db80630dec44/raw/959a34a1797b0e3fdc860a6ef0057c62ee898dd7/gdp.csv'
deaths_ts = pd.read_csv(deaths_ts_uri)
deaths_ts.head()
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 10/16/20 10/17/20 10/18/20 10/19/20 10/20/20 10/21/20 10/22/20 10/23/20 10/24/20 10/25/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 1485 1488 1492 1497 1499 1501 1505 1507 1511 1514
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 443 448 451 454 458 462 465 469 473 477
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 1841 1846 1856 1865 1873 1880 1888 1897 1907 1914
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 59 59 59 62 62 63 63 69 69 69
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 234 241 247 248 251 255 260 265 267 268

5 rows × 282 columns

gdp_us = pd.read_csv(gdp_current_us_dollars)
gdp_us.head()
Series Name Series Code Country Name Country Code 2019 [YR2019]
0 GDP per capita (current US$) NY.GDP.PCAP.CD Afghanistan AFG 502.115486913067
1 GDP per capita (current US$) NY.GDP.PCAP.CD Albania ALB 5352.85741103671
2 GDP per capita (current US$) NY.GDP.PCAP.CD Algeria DZA 3948.34327892571
3 GDP per capita (current US$) NY.GDP.PCAP.CD American Samoa ASM ..
4 GDP per capita (current US$) NY.GDP.PCAP.CD Andorra AND 40886.3911648431
population = pd.read_csv(population_uri)
population.head()
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ Combined_Key Population
0 4 AF AFG 4.0 NaN NaN NaN Afghanistan 33.93911 67.709953 Afghanistan 38928341.0
1 8 AL ALB 8.0 NaN NaN NaN Albania 41.15330 20.168300 Albania 2877800.0
2 12 DZ DZA 12.0 NaN NaN NaN Algeria 28.03390 1.659600 Algeria 43851043.0
3 20 AD AND 20.0 NaN NaN NaN Andorra 42.50630 1.521800 Andorra 77265.0
4 24 AO AGO 24.0 NaN NaN NaN Angola -11.20270 17.873900 Angola 32866268.0
def log_step(func):
    """Decorator that logs the output shape and execution time of each pipeline step."""
    @wraps(func)
    def wrapper(dataf, *args, **kwargs):
        start = dt.datetime.now()
        output = func(dataf, *args, **kwargs)
        end = dt.datetime.now()
        print(f"After function {func.__name__} ran, shape of dataframe is - {output.shape}, execution time is - {end-start}")
        return output
    return wrapper
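
As a quick illustration of how a @log_step function plugs into DataFrame.pipe, here is a toy example with made-up data (not part of the actual pipeline):

@log_step
def double_values(dataf):
    """Toy step: double every value."""
    return dataf * 2

toy = pd.DataFrame({'a': [1, 2, 3]})
toy.pipe(double_values)
# Prints something like:
# After function double_values ran, shape of dataframe is - (3, 1), execution time is - 0:00:00.000...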

@log_step
def start_pipeline(dataf):
    return dataf.copy()

@log_step
def remove_cols(dataf, *args):
    """Drop the given columns from the dataframe."""
    return dataf.drop(columns=list(args))

@log_step
def remove_null(dataf):
    return dataf.dropna()

def rename_cols(dataf, **kwargs):
    """Rename raw dataframe columns to shorter names without spaces, which read
    better in the visualization and avoid issues with Altair's shorthand syntax."""
    return dataf.rename(columns=kwargs)

@log_step
def filter_rows(dataf, which):
    """Keep only usable rows: drop missing GDP values ('..') in the GDP data,
    or keep only country-level rows (no Province_State) in the population data."""
    if which == 'gdp':
        return dataf[dataf['current_us'] != '..']
    elif which == 'pop':
        return dataf[pd.isnull(dataf['Province_State'])]

def set_dtypes(dataf):
    """Set the datatypes of columns."""
    # equivalently: dataf.assign(current_us=lambda d: d['current_us'].astype(float))
    dataf['current_us'] = dataf['current_us'].astype(float)
    return dataf

# def remove_outliers(dataf):
#     """remove outliers"""
#     return dataf

# def add_features(dataf):
#     return dataf

@log_step
def clean(dataf):
    """Aggregate deaths per country, keep only the selected wealthy countries,
    and convert cumulative deaths into daily deaths per million up to 4 August 2020.
    Relies on pop_w_gdp and countries_population, which are built below before this step runs."""
    agg_deaths = dataf.groupby('Country/Region').sum().reset_index()
    agg_deaths = agg_deaths[agg_deaths['Country/Region'].isin(pop_w_gdp['Country/Region'])].set_index('Country/Region')
    result = agg_deaths.T.reset_index().rename_axis(None, axis=1).rename(columns={'index': 'Date'})
    result['Date'] = pd.to_datetime(result['Date'], format="%m/%d/%y")
    result = result[result['Date'] < '2020-08-05']

    # Convert cumulative deaths into daily deaths per million inhabitants
    for col in result:
        if col != 'Date':
            result[col] = result[col].diff()
            result[col] = (result[col] / int(countries_population[countries_population['Country/Region'] == col]['Population'])) * 1000000
    return result
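
For reference, the per-million conversion above can also be written without the column loop. The helper below is only an equivalent sketch under the same assumptions (the same countries_population frame and the wide result table); it is not used by the pipeline:

def to_daily_per_million(result, populations):
    """Loop-free equivalent of the conversion inside clean() (sketch only)."""
    pops = populations.set_index('Country/Region')['Population']
    daily = result.set_index('Date').diff()
    # Divide each country column by that country's population, then scale to per million
    return (daily.div(pops.reindex(daily.columns), axis='columns') * 1000000).reset_index()
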
gdp = (gdp_us
       .pipe(start_pipeline)
       .pipe(remove_null)
       .pipe(remove_cols, *['Series Name', 'Series Code'])
       .pipe(rename_cols, **{'2019 [YR2019]': 'current_us'})
       .pipe(filter_rows, 'gdp')
       .pipe(set_dtypes))

countries_population = (population
       .pipe(start_pipeline)
       .pipe(filter_rows, 'pop')
       .pipe(remove_cols, *['UID', 'iso2', 'code3', 'FIPS', 'Admin2', 'Province_State', 'Lat', 'Long_', 'Combined_Key'])
       .pipe(rename_cols, **{'iso3': 'Country Code', 'Country_Region':'Country/Region'}))

# Combine population with GDP per capita
pop_w_gdp = countries_population.merge(gdp, how='inner', on='Country Code')

# Keep only wealthy countries, i.e. GDP per capita above 25,000 USD, with a population above 10 million
pop_w_gdp = pop_w_gdp[(pop_w_gdp['current_us'] > 25000) & (pop_w_gdp['Population'] > 10000000)]

# Build the daily deaths per million data for plotting
plot_data = (deaths_ts
             .pipe(start_pipeline)
             .pipe(remove_cols, *['Province/State', 'Lat', 'Long'])
             .pipe(clean)
             .pipe(remove_null)
            )

plot_data.head()
After function start_pipeline ran, shape of dataframe is - (269, 5), execution time is - 0:00:00.000376
After function remove_null ran, shape of dataframe is - (264, 5), execution time is - 0:00:00.182202
After function remove_cols ran, shape of dataframe is - (264, 3), execution time is - 0:00:00.073348
After function filter_rows ran, shape of dataframe is - (222, 3), execution time is - 0:00:00.005289
After function start_pipeline ran, shape of dataframe is - (4154, 12), execution time is - 0:00:00.006278
After function filter_rows ran, shape of dataframe is - (189, 12), execution time is - 0:00:00.012600
After function remove_cols ran, shape of dataframe is - (189, 3), execution time is - 0:00:00.005748
After function start_pipeline ran, shape of dataframe is - (267, 282), execution time is - 0:00:00.000790
After function remove_cols ran, shape of dataframe is - (267, 279), execution time is - 0:00:00.003718
After function clean ran, shape of dataframe is - (196, 14), execution time is - 0:00:00.212112
After function remove_null ran, shape of dataframe is - (195, 14), execution time is - 0:00:00.004697
Date Australia Belgium Canada France Germany Italy Japan Korea, South Netherlands Spain Sweden US United Kingdom
1 2020-01-23 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2020-01-24 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2020-01-25 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 2020-01-26 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 2020-01-27 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

This is an interactive plot. Hover over it to see individual lines highlighted.
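
A chart with this hover-to-highlight behavior can be built from plot_data roughly as follows. This is a minimal sketch: the long-format reshape, selection settings, sizing and titles are my own choices and not necessarily identical to the original chart.

# Altair prefers long-format data: one row per (Date, Country) observation
long_data = plot_data.melt(id_vars='Date', var_name='Country', value_name='Daily_Deaths_Per_Million')

# Highlight the hovered country's line and dim the others
hover = alt.selection_single(fields=['Country'], on='mouseover', empty='all')

chart = (alt.Chart(long_data)
         .mark_line()
         .encode(
             x=alt.X('Date:T', title='Date'),
             y=alt.Y('Daily_Deaths_Per_Million:Q', title='Daily deaths per million'),
             color=alt.Color('Country:N'),
             opacity=alt.condition(hover, alt.value(1.0), alt.value(0.2)),
             tooltip=['Country:N', 'Date:T', 'Daily_Deaths_Per_Million:Q'])
         .add_selection(hover)
         .properties(width=700, height=400))

chart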