Notebook

"Infographics Challenge, Economic Implications of COVID-19"

First submission notebook

Lucas BL

In [136]:
import pandas as pd
import numpy as np
import empyrical as ep
import alphalens as al

from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.factset import GeoRev
from quantopian.pipeline.domain import US_EQUITIES
from quantopian.research import run_pipeline
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.data.factset import RBICSFocus

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set(style="whitegrid")

from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
import statsmodels
import statsmodels.api as sm
In [137]:
# Load COVID-19 data. We periodically update the file in your research
# folder. You can also download an updated file from 
# https://ourworldindata.org/coronavirus-source-data and put 
# it into your data directory in research
covid = local_csv('covid19_cases.csv', date_column='date')
covid.head()
Out[137]:
location new_cases new_deaths total_cases total_deaths
date
2019-12-31 00:00:00+00:00 Afghanistan 0 0 0 0
2020-01-01 00:00:00+00:00 Afghanistan 0 0 0 0
2020-01-02 00:00:00+00:00 Afghanistan 0 0 0 0
2020-01-03 00:00:00+00:00 Afghanistan 0 0 0 0
2020-01-04 00:00:00+00:00 Afghanistan 0 0 0 0

Definition of more specific economic areas

In [138]:
# Map selected countries to their region
region_country = {
    'United States': 'North America',
    'Canada': 'North America',
    'Austria': 'East-Europe',
    'Bulgaria': 'East-Europe',
    'Croatia': 'East-Europe',
    'Denmark': 'West-Europe',
    'Estonia': 'East-Europe',
    'Finland': 'Scandinavian_Countries',
    'France': 'West-Europe',
    'Germany': 'West-Europe',
    'Greece': 'East-Europe',
    'Iceland': 'Scandinavian_Countries',
    'Hungary': 'East-Europe',
    'Italy': 'West-Europe',
    'Netherlands': 'West-Europe',
    'Poland': 'East-Europe',
    'Portugal': 'West-Europe',    
    'Romania': 'East-Europe',
    'Spain': 'West-Europe',
    'Serbia': 'East-Europe',    
    'Switzerland': 'West-Europe',
    'Sweden': 'Scandinavian_Countries',
    'United Kingdom': 'West-Europe',
    'Australia': 'Asia-Pacific',
    'Bangladesh': 'Asia-Pacific',
    'Bhutan': 'Asia-Pacific',
    'China': 'Asia-Pacific',
    'Indonesia': 'Asia-Pacific',
    'Japan': 'Asia-Pacific',
    'Malaysia': 'Asia-Pacific',
    'Myanmar': 'Asia-Pacific',
    'New Zealand': 'Asia-Pacific',
    'Singapore': 'Asia-Pacific',
    'South Korea': 'Asia-Pacific',
    'Taiwan': 'Asia-Pacific',
    'Vietnam': 'Asia-Pacific',
}

covid['region'] = covid['location'].map(region_country)

covid_region = covid.reset_index().groupby(['date', 'region']).sum().reset_index(level='region')
In [139]:
covid_region.head()
Out[139]:
region new_cases new_deaths total_cases total_deaths
date
2019-12-31 00:00:00+00:00 Asia-Pacific 27 0 27 0
2019-12-31 00:00:00+00:00 East-Europe 0 0 0 0
2019-12-31 00:00:00+00:00 North America 0 0 0 0
2019-12-31 00:00:00+00:00 Scandinavian_Countries 0 0 0 0
2019-12-31 00:00:00+00:00 West-Europe 0 0 0 0
In [140]:
df = covid_region.reset_index()
region_totcases = pd.DataFrame()

for name, group in df.groupby(df.iloc[:,1]):
    if region_totcases.empty:
        region_totcases = group.set_index("date")[["total_cases"]].rename(columns={"total_cases":name})
    else:
        region_totcases = region_totcases.join(group.set_index("date")[["total_cases"]]).rename(columns={"total_cases":name})

print(region_totcases.head())

region_totcases.pct_change().rolling(1).mean().cumsum().plot(legend=True)
plt.ylabel('# Total cases') 
plt.title('COVID-19 growth in different regions') 
plt.legend(loc=0)

plt.figure()
sns.clustermap(region_totcases.corr(), vmin=-1, vmax=1, center=0, linewidths=.75, cmap='coolwarm')
                           Asia-Pacific  East-Europe  North America  \
date                                                                  
2019-12-31 00:00:00+00:00            27            0              0   
2020-01-01 00:00:00+00:00            27            0              0   
2020-01-02 00:00:00+00:00            27            0              0   
2020-01-03 00:00:00+00:00            44            0              0   
2020-01-04 00:00:00+00:00            44            0              0   

                           Scandinavian_Countries  West-Europe  
date                                                            
2019-12-31 00:00:00+00:00                       0            0  
2020-01-01 00:00:00+00:00                       0            0  
2020-01-02 00:00:00+00:00                       0            0  
2020-01-03 00:00:00+00:00                       0            0  
2020-01-04 00:00:00+00:00                       0            0  
Out[140]:
<seaborn.matrix.ClusterGrid at 0x7f5ce38fa390>
<matplotlib.figure.Figure at 0x7f5ce776aac8>

In the graph of the total cases above we can see that the number of total cases are increasing in a differnet timespan, reflecting the interconnectiveness of specific areas. Moreover, specific countries disease prevention may influence the delay of observation as well as consumer behavior

In [141]:
def return_dataframe(data, iloc_column, variable):
    dataframe = data.reset_index()
    new_dataframe = pd.DataFrame()
    for name, group in dataframe.groupby(dataframe.iloc[:,iloc_column]):
        if new_dataframe.empty:
            new_dataframe = group.set_index("date")[[variable]].rename(columns={variable:name})
        else:
            new_dataframe = new_dataframe.join(group.set_index("date")[[variable]]).rename(columns={variable:name})
    
    return new_dataframe
In [142]:
covid_df = return_dataframe(covid.iloc[:,:], 1, "total_cases")
print('Rows in the dataframe= ', len(covid_df))
# covid_df.tail()
Rows in the dataframe=  88
In [143]:
covid_df = covid_df.dropna(thresh=len(covid_df)*0.8, axis=1, how='all').iloc[30:-1,:] 
#Drop January since there is a lot of missing data and reporting errors

Second derivative of delta to map the velocity (speed) of the epidemic withinh the area

In [144]:
sns.clustermap(covid_df.pct_change().corr(method='pearson'), vmin=-1, vmax=1, center=0, cmap='coolwarm')
Out[144]:
<seaborn.matrix.ClusterGrid at 0x7f5ce112b080>
In [145]:
thresh = 10000
covid_region.groupby('region')['total_cases'].plot(legend=True);
plt.axhline(thresh, ls='--', color='0.5', label='Chosen threshold');
plt.ylabel('# confirmed cases'); plt.title('COVID-19 growth in different regions'); plt.legend(loc=0);
In [146]:
# Compute date where threshold was crossed in each region
covid_region_date = covid_region.loc[lambda x: x.total_cases > thresh].reset_index().groupby('region').first()['date'].sort_values()
covid_region_date

# Compute date where threshold was crossed in each countries
covid_location_date = covid.loc[lambda x: x.total_cases > thresh].reset_index().groupby('location').first()['date'].sort_values()
covid_location_date
Out[146]:
location
World            2020-02-01 00:00:00+00:00
China            2020-02-01 00:00:00+00:00
Italy            2020-03-11 00:00:00+00:00
Iran             2020-03-13 00:00:00+00:00
Spain            2020-03-18 00:00:00+00:00
France           2020-03-20 00:00:00+00:00
Germany          2020-03-20 00:00:00+00:00
United States    2020-03-20 00:00:00+00:00
Switzerland      2020-03-27 00:00:00+00:00
United Kingdom   2020-03-27 00:00:00+00:00
Belgium          2020-03-30 00:00:00+00:00
Netherlands      2020-03-30 00:00:00+00:00
Turkey           2020-03-31 00:00:00+00:00
Austria          2020-04-01 00:00:00+00:00
Canada           2020-04-03 00:00:00+00:00
South Korea      2020-04-03 00:00:00+00:00
Portugal         2020-04-05 00:00:00+00:00
Brazil           2020-04-05 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]
In [147]:
# Revenue exposure to North America, Asia / Pacific and Europe.
GeoRevNA = GeoRev.slice('NORTH AMERICA')
GeoRevAP = GeoRev.slice('ASIA-PACIFIC')
GeoRevEU = GeoRev.slice('EUROPE')

# Most recent revenue exposure.
rev_exposure_NA = GeoRevNA.est_pct.latest
rev_exposure_AP = GeoRevAP.est_pct.latest
rev_exposure_EU = GeoRevEU.est_pct.latest

# We are not using RBICS sectors for this analysis
# but putting it here in case you want to use it.
sector = RBICSFocus.l1_name.latest

# Add all factors to a pipeline and run it.
pipe = Pipeline(
    columns={
        'rev_exposure_NA': rev_exposure_NA.rank().zscore(),
        'rev_exposure_AP': rev_exposure_AP.rank().zscore(),
        'rev_exposure_EU': rev_exposure_EU.rank().zscore(),
        'sector': sector,
    },
    domain=US_EQUITIES,
    screen=(QTradableStocksUS() & rev_exposure_NA.notnull() & rev_exposure_AP.notnull() & rev_exposure_EU.notnull()),
)

# Run the pipeline for the most recent date available, georev is only updated yearly
df = run_pipeline(pipe, '2018-01-01', '2018-01-01')
df = df.reset_index(level=0, drop=True) # drop date index as it's just a single day
print(df.head())

Pipeline Execution Time: 3.21 Seconds
                   rev_exposure_AP  rev_exposure_EU  rev_exposure_NA  \
Equity(2 [HWM])          -0.431457         0.978334        -0.803243   
Equity(24 [AAPL])         1.142255         0.564753        -1.286297   
Equity(31 [ABAX])        -0.596019         0.466349        -0.510013   
Equity(52 [ABM])         -1.355218        -1.339148        -0.151785   
Equity(53 [ABMD])        -0.969395        -0.111239        -0.238942   

                              sector  
Equity(2 [HWM])          Industrials  
Equity(24 [AAPL])         Technology  
Equity(31 [ABAX])         Healthcare  
Equity(52 [ABM])   Business Services  
Equity(53 [ABMD])         Healthcare  
In [148]:
df = df.drop('sector', axis='columns')
In [149]:
df.head()
Out[149]:
rev_exposure_AP rev_exposure_EU rev_exposure_NA
Equity(2 [HWM]) -0.431457 0.978334 -0.803243
Equity(24 [AAPL]) 1.142255 0.564753 -1.286297
Equity(31 [ABAX]) -0.596019 0.466349 -0.510013
Equity(52 [ABM]) -1.355218 -1.339148 -0.151785
Equity(53 [ABMD]) -0.969395 -0.111239 -0.238942
In [150]:
# Get stock returns
prices = get_pricing(
  symbols=df.index,
  start_date=covid.index[0],
  end_date=covid.index[-1], 
  fields='close_price',
)
In [151]:
# Add datetime index with forward filling to match prices
# This effectively forward-fills the 2018 data
factor = pd.concat({dt: df for dt in prices.index})
factor.head()
Out[151]:
rev_exposure_AP rev_exposure_EU rev_exposure_NA
2019-12-31 00:00:00+00:00 Equity(2 [HWM]) -0.431457 0.978334 -0.803243
Equity(24 [AAPL]) 1.142255 0.564753 -1.286297
Equity(31 [ABAX]) -0.596019 0.466349 -0.510013
Equity(52 [ABM]) -1.355218 -1.339148 -0.151785
Equity(53 [ABMD]) -0.969395 -0.111239 -0.238942
In [152]:
# Use alphalens to compute factor returns for each column
factor_returns = {}
for col in factor.columns:
    factor_data = al.utils.get_clean_factor_and_forward_returns(
        factor[col], prices, periods=[1])
    factor_returns[col] = al.performance.factor_returns(factor_data)['1D']

factor_returns = pd.DataFrame(factor_returns)

factor_returns.head()
Dropped 8.9% entries from factor data: 8.9% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Dropped 8.9% entries from factor data: 8.9% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Dropped 8.9% entries from factor data: 8.9% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Out[152]:
rev_exposure_AP rev_exposure_EU rev_exposure_NA
date
2019-12-31 00:00:00+00:00 0.005583 0.002659 -0.005307
2020-01-02 00:00:00+00:00 -0.002485 -0.003441 0.003255
2020-01-03 00:00:00+00:00 -0.001496 -0.000700 0.001221
2020-01-06 00:00:00+00:00 0.003154 0.001807 -0.003991
2020-01-07 00:00:00+00:00 0.001746 0.002252 0.000951
In [153]:
factor_returns.columns = ['Asia/Pacific factor returns', 'Europe factor returns', 'North America factor returns']

Elbow curve using K-Mean Clsutering

map "relevant" cluster number regarding possible noises in the data (Highest convexity is generally the appropriate number)

In [154]:
#Calculate average annual percentage return and volatilities over a theoretical one year period
delta_df = covid_df.pct_change().rolling(1).mean().mean(axis=1)
delta_df = pd.DataFrame(delta_df).dropna()
delta_df.columns = ['delta_df']
delta_df['Volatility'] = delta_df.rolling(2).std().mean(axis=1)
delta_df = delta_df.pct_change().dropna()

# #format the data as a numpy array to feed into the K-Means algorithm
X = np.asarray([np.asarray(delta_df['delta_df']),np.asarray(delta_df['Volatility'])]).T

distorsions = []
for k in range(2, 40):
    k_means = KMeans(n_clusters=k)
    k_means.fit(X)
    distorsions.append(k_means.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 40), distorsions, c='b')
plt.grid(True)
plt.title('Elbow curve')
Out[154]:
<matplotlib.text.Text at 0x7f5cab77ef60>
In [168]:
def hrc_dendrogram(data, thresh=None, ax=None, figsize=None, scale=None):
    #transpose the dataframe
    x = data.corr(method='spearman')
    # Normalize the movements: normalized_movements
    normalized_movements = normalize(x)
    # Calculate the linkage: mergings
    mergings = linkage(normalized_movements, method='ward')
    # print(mergings)
    global labels
    global nbofclusters
    labels = fcluster(mergings, thresh, criterion='distance')
    labels = pd.DataFrame({'Cluster':labels, 'Tickers':x.index}).set_index('Tickers')
    nbofclusters = labels.max()
    # Plot the dendrogram
    labelList = [i for i in x.index]
#     plt.figure(figsize=figsize)
#     plt.yscale(scale)
    dendrogram(mergings,
            truncate_mode='level',
            color_threshold=thresh,
            leaf_rotation=90.,
            leaf_font_size=15.,
            labels=labelList,
            show_contracted=True, ax=ax)
#     plt.suptitle('Hiearchical Clustering Dendrogram', fontsize=15); plt.ylabel('height', fontsize=15);
#     plt.show()
    
hrc_dendrogram(covid_df.pct_change(), thresh=1.6, scale='linear', figsize=(20,3))

It appear that some relationship exist (at least at soem extent) in the velocity of the spread of the COVID19 in some contries, it mostly map the behavior of the epidemy in each country that are influenced by soem factors such as, social-distancing, sanitary control, prevention, etc. There is error rates since the data are only based on total cases reported

In [156]:
# Define the clusters according to the same metrics as above i.e. "distance" for euclidian
# Nb of clusters > arbitrary decision
# Plot Distance (euclidean) Classification classification
model = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
model.fit(covid_df.corr())
labels = model.labels_
cluster = pd.DataFrame()
# covid_df.mean(axis=1).cumsum().plot(color='b', alpha=0.5, title="Industries classification Hiearchical Clustering (12)")
for i in range(labels.max()+1):
    cluster['cluster_{}'.format(i)] = covid_df.T[labels==i].T.mean(axis=1)
    cluster['cluster_{}'.format(i)].diff(1).rolling(1).mean().cumsum().plot(alpha=1, legend=True)

plt.show()

Here we can clearly spot China as the Cluster_3 other reflect eurozone and North america zones, further improvements could map those clusters with already defined economic regions

In [169]:
# General Parameters for Tearsheet
fig, axs = plt.subplots(nrows=5, sharex=False, figsize=(20,35));

#Regional plots
region_totcases.pct_change().rolling(1).mean().cumsum().plot(legend=True, ax=axs[0])
plt.legend(loc=0);
axs[0].set(title='Cumulative changes  in total cases',
           ylabel='Cumulative changes', xlabel='');

# Plot of Factor returns 
ep.cum_returns(factor_returns).plot(ax=axs[1])
colors = ['b', 'g', 'r', 'brown', 'gray', 'orange', 'black', 'cyan']
axs[1].set(title='GeoRev-weighted factor returns in response to COVID-19 cases',
           ylabel='Cumulative returns',xlabel='');

# Plot of Factor returns volatility
factor_returns.rolling(5).std(axis=1).plot(ax=axs[2])  
axs[2].set(title='GeoRev-weighted Weekly Standard deviation of mean factor returns in response to COVID-19',
           ylabel='Standard Deviation',xlabel='');

# Plot Factor spread
(factor_returns.rolling(5).mean().max(axis=1) - factor_returns.rolling(5).mean().min(axis=1)).plot(ax=axs[3], label='Spread changes')
for c, (region, date) in zip(colors, covid_location_date.iteritems()):
    axs[3].axvline(date, label=region, ls='--', color=c)  
axs[3].legend(loc=2); 
axs[3].set(title='Weekly spread of factor returns | bars countries crossing {} cases'.format(thresh),
           ylabel='Spread (High-Low)',xlabel='');

# fig, axs = plt.subplots(nrows=2, sharex=False)
hrc_dendrogram(covid_df**2, thresh=0.07, ax=axs[4])
axs[4].set(title='Hiearchical Clustering Dendrogram',
           ylabel='Height', yscale='linear');