Pandas in Python for Economics and Finance
SAFE Research Data Center, Errikos Melissinos, Research Assistant, May 2022
Abstract & Documentation:
In this webinar we will give a basic introduction to Python's Pandas library. We will go through a series of applications covering topics in economics, finance and econometrics, while also utilising databases that are available to Goethe University.
*.ipynb file (please note that the .ipynb file requires a Jupyter environment, e.g. Google Colab or JupyterLab)
YouTube video (75 mins)
Pandas is a Python library that allows the manipulation of data through Dataframes.
Python is a programming language. Some of its basic building blocks are numbers, strings, lists and dictionaries:
x1 = 3 # numbers
x2 = 13.3
str1 = "hi" # strings
group = [1, 2, 3, 4, 5, 6, 7, 8] # lists
nums = {"int": x1, "float": x2} # dictionaries
a = [1, 2, 3, 4, 5, 6, 7, 8]
# also tuples
b = (1,2,3,4)
# and of course there is some fancy stuff
[3*element for element in a if element%2==0]
[6, 12, 18, 24]
print(nums["int"])
3
if x1<x2:
    print("The float is bigger.")
else:
    print("The integer is bigger.")
The float is bigger.
for key, value in nums.items():
    print(f"Key {key} is associated with value {value}.")
Key int is associated with value 3.
Key float is associated with value 13.3.
Python is also object-oriented: everything is an object with its own methods and properties.
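For instance, the string and the list defined above already come with their own methods (a quick illustration):
str1.upper() # 'HI'
group.count(3) # 1, the number of times 3 appears in the list
We can also define our own functions: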
def weightedMean(nums, wts):
    """
    Computes the weighted mean.
    The first argument is the array of values and the second the array of weights.
    """
    assert len(nums)==len(wts), "The arrays given do not have the same length!"
    total = 0
    totalWts = 0
    for i in range(len(nums)):
        total += nums[i]*wts[i]
        totalWts += wts[i]
    return total/totalWts
weightedMean([10,20],[1,2])
16.666666666666668
Python has many important libraries but here we will mostly look at Pandas.
First we will import the library so that we can use it in our Python session:
import pandas as pd # show: pandas cheat sheet
import numpy as np
import eurostat
The way to start with Pandas is either by creating a DataFrame using data that has already been loaded, by reading an external file, or by downloading the data.
### directly creating a dataframe works with:
# pd.DataFrame([1,2,3,4,5,6,7,8,9]) # and there can be more options
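# for example, a small frame can also be built from a dictionary of columns (illustrative values only):
# pd.DataFrame({"country": ["DE", "FR"], "hicp": [116.9, 112.8]})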
### loading from a file one can use:
# df = pd.read_csv("") # or pd.read_excel(), pd.read_stata()
## Here I will use data that I download from the web. In particular, from Eurostat
# get info here: https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing
df0 = eurostat.get_data_df("ei_cphi_m",flags=False) # "ei_cphi_m" this is the code for the HICP
df=df0
# head, tail and sample can be used to inspect a few rows
df.sample(20)
# look at the column names
df.columns
# what are these other columns?
# df["indic"].unique()
Index(['unit', 's_adj', 'indic', 'geo\time', '2022M04', '2022M03', '2022M02', '2022M01', '2021M12', '2021M11', ... '1996M10', '1996M09', '1996M08', '1996M07', '1996M06', '1996M05', '1996M04', '1996M03', '1996M02', '1996M01'], dtype='object', length=320)
# it is similar to the dictionary notation
# each column is an entry to the dictionary and the key is the column's name
df["geo\\time"]
# then each column can be thought of as a list (it is actually a pandas Series)
# and we can get to the elements by indices
df["geo\\time"][1]
# but pandas offers more
# multiple columns
df[["geo\\time"]]
# iloc: select by integer position (here rows 0-19 and columns 9-12)
df.iloc[0:20,9:13]
# loc: select by label or boolean condition, here the rows for Germany
df.loc[df["geo\\time"]=="DE"]
# this also gives us a way to filter and keep only the values of "unit" and "indic" that we are interested in
df = df.loc[((df["unit"]=="HICP2015")&(df["indic"]=="CP-HI00"))]
# this is also a way to rearrange or delete columns
df[["2021M01","2022M01" ]]
df
 | unit | s_adj | indic | geo\time | 2022M04 | 2022M03 | 2022M02 | 2022M01 | 2021M12 | 2021M11 | ... | 1996M10 | 1996M09 | 1996M08 | 1996M07 | 1996M06 | 1996M05 | 1996M04 | 1996M03 | 1996M02 | 1996M01
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HICP2015 | NSA | CP-HI00 | AT | 118.75 | 118.00 | 115.40 | 113.93 | 113.95 | 113.62 | ... | 71.84 | 71.69 | 71.69 | 71.91 | 71.84 | 71.62 | 71.69 | 71.77 | 71.55 | 71.34 |
1 | HICP2015 | NSA | CP-HI00 | BE | 120.61 | 120.28 | 119.49 | 116.99 | 115.93 | 115.95 | ... | 70.89 | 70.54 | 70.40 | 70.40 | 70.47 | 70.61 | 70.47 | 70.12 | 69.98 | 69.91 |
2 | HICP2015 | NSA | CP-HI00 | BG | 121.12 | 118.59 | 116.16 | 114.80 | 113.44 | 112.54 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | HICP2015 | NSA | CP-HI00 | CH | 103.15 | 102.73 | 102.25 | 101.69 | 101.53 | 101.57 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | HICP2015 | NSA | CP-HI00 | CY | 109.38 | 106.13 | 104.22 | 103.33 | 104.04 | 104.37 | ... | 67.82 | 67.18 | 65.97 | 67.07 | 66.85 | 66.96 | 66.76 | 66.46 | 64.45 | 66.48 |
5 | HICP2015 | NSA | CP-HI00 | CZ | 129.10 | 126.80 | 124.40 | 122.80 | 117.40 | 117.00 | ... | 59.80 | 59.60 | 59.50 | 59.40 | 58.90 | 58.40 | 58.10 | 57.60 | 57.30 | 57.00 |
6 | HICP2015 | NSA | CP-HI00 | DE | 116.90 | 116.10 | 113.30 | 112.30 | 111.30 | 111.00 | ... | 75.80 | 75.90 | 75.90 | 76.00 | 75.80 | 75.70 | 75.60 | 75.70 | 75.60 | 75.10 |
7 | HICP2015 | NSA | CP-HI00 | DK | 111.80 | 109.80 | 109.00 | 108.00 | 106.30 | 106.90 | ... | 72.10 | 72.00 | 71.60 | 71.50 | 71.60 | 71.70 | 71.50 | 71.30 | 70.80 | 70.40 |
8 | HICP2015 | NSA | CP-HI00 | EA | 115.11 | 114.46 | 111.74 | 110.70 | 110.37 | 109.90 | ... | 72.00 | 71.93 | 71.80 | 71.84 | 71.84 | 71.83 | 71.66 | 71.54 | 71.29 | 70.97 |
9 | HICP2015 | NSA | CP-HI00 | EA18 | 115.02 | 114.38 | 111.66 | 110.63 | 110.31 | 109.85 | ... | 71.56 | 71.48 | 71.31 | 71.35 | 71.38 | 71.36 | 71.19 | 71.06 | 70.76 | 70.46 |
10 | HICP2015 | NSA | CP-HI00 | EA19 | 115.11 | 114.46 | 111.74 | 110.70 | 110.37 | 109.90 | ... | 71.52 | 71.44 | 71.27 | 71.31 | 71.33 | 71.32 | 71.14 | 71.01 | 70.71 | 70.40 |
11 | HICP2015 | NSA | CP-HI00 | EE | 132.59 | 127.31 | 124.02 | 122.39 | 122.81 | 119.02 | ... | 46.36 | 46.11 | 45.92 | 46.36 | 46.23 | 45.95 | 45.72 | 44.95 | 44.24 | 42.87 |
12 | HICP2015 | NSA | CP-HI00 | EL | 110.55 | 108.43 | 105.63 | 104.65 | 104.87 | 104.35 | ... | 62.60 | 62.11 | 60.70 | 60.76 | 61.86 | 61.74 | 61.25 | 60.64 | 58.92 | 59.04 |
13 | HICP2015 | NSA | CP-HI00 | ES | 115.21 | 115.51 | 111.15 | 110.26 | 111.13 | 109.87 | ... | 65.45 | 65.38 | 65.19 | 64.99 | 64.93 | 64.99 | 64.73 | 64.34 | 64.08 | 63.95 |
14 | HICP2015 | NSA | CP-HI00 | EU | 117.32 | 116.36 | 113.65 | 112.67 | 112.06 | 111.54 | ... | 70.58 | 70.53 | 70.32 | 70.28 | 70.39 | 70.37 | 70.19 | 70.00 | 69.70 | 69.40 |
15 | HICP2015 | NSA | CP-HI00 | EU27_2020 | 116.83 | 115.87 | 113.18 | 112.20 | 111.59 | 111.07 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
16 | HICP2015 | NSA | CP-HI00 | EU28 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
17 | HICP2015 | NSA | CP-HI00 | FI | 111.98 | 111.54 | 109.73 | 109.10 | 107.60 | 107.72 | ... | 71.92 | 71.85 | 71.70 | 71.99 | 71.99 | 71.99 | 71.78 | 71.63 | 71.49 | 71.20 |
18 | HICP2015 | NSA | CP-HI00 | FR | 112.78 | 112.26 | 110.49 | 109.51 | 109.34 | 109.09 | ... | 75.22 | 75.02 | 74.78 | 74.94 | 75.11 | 75.19 | 74.99 | 74.90 | 74.42 | 74.13 |
19 | HICP2015 | NSA | CP-HI00 | HR | 114.94 | 111.93 | 109.80 | 108.76 | 108.31 | 108.18 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
20 | HICP2015 | NSA | CP-HI00 | HU | 129.03 | 126.90 | 125.64 | 124.26 | 122.51 | 122.20 | ... | 33.02 | 32.58 | 32.13 | 32.04 | 31.91 | 31.66 | 30.99 | 30.48 | 30.01 | 29.31 |
21 | HICP2015 | NSA | CP-HI00 | IE | 110.30 | 109.30 | 107.10 | 106.10 | 106.50 | 106.00 | ... | 69.60 | 69.70 | 69.30 | 68.90 | 69.10 | 68.90 | 68.80 | 68.90 | 68.60 | 68.10 |
22 | HICP2015 | NSA | CP-HI00 | IS | 111.35 | 110.31 | 109.29 | 108.22 | 107.99 | 107.62 | ... | 43.61 | 43.48 | 43.48 | 43.31 | 43.27 | 43.27 | 43.09 | 42.92 | 42.83 | 42.75 |
23 | HICP2015 | NSA | CP-HI00 | IT | 111.70 | 111.30 | 108.70 | 107.80 | 107.80 | 107.30 | ... | 68.80 | 68.70 | 68.70 | 68.60 | 68.70 | 68.50 | 68.30 | 68.00 | 67.80 | 67.50 |
24 | HICP2015 | NSA | CP-HI00 | LT | 132.40 | 129.93 | 126.87 | 124.68 | 122.37 | 120.76 | ... | 58.85 | 58.51 | 58.03 | 57.94 | 57.86 | 57.43 | 57.20 | 56.37 | 55.09 | 53.84 |
25 | HICP2015 | NSA | CP-HI00 | LU | 118.29 | 117.22 | 114.98 | 112.30 | 112.11 | 112.56 | ... | 65.84 | 65.71 | 65.71 | 65.64 | 65.57 | 65.57 | 65.51 | 65.38 | 65.31 | 65.25 |
26 | HICP2015 | NSA | CP-HI00 | LV | 125.09 | 122.24 | 118.39 | 116.44 | 116.46 | 115.99 | ... | 47.73 | 47.17 | 46.86 | 47.21 | 47.06 | 46.33 | 46.14 | 45.91 | 45.21 | 44.44 |
27 | HICP2015 | NSA | CP-HI00 | MK | 120.68 | 118.00 | 116.01 | 114.68 | 113.11 | 112.82 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
28 | HICP2015 | NSA | CP-HI00 | MT | 112.52 | 108.46 | 107.08 | 106.36 | 106.33 | 106.38 | ... | 64.82 | 65.12 | 64.86 | 64.83 | 64.70 | 64.77 | 64.31 | 61.94 | 61.65 | 61.48 |
29 | HICP2015 | NSA | CP-HI00 | NL | 121.16 | 120.70 | 115.58 | 114.61 | 114.09 | 113.02 | ... | 69.32 | 69.11 | 68.36 | 68.49 | 68.70 | 69.04 | 69.32 | 69.11 | 68.29 | 67.94 |
30 | HICP2015 | NSA | CP-HI00 | NO | 122.70 | 120.90 | 119.90 | 118.50 | 120.50 | 119.50 | ... | 70.70 | 70.40 | 70.10 | 70.20 | 70.00 | 69.90 | 69.70 | 69.50 | 69.30 | 69.20 |
31 | HICP2015 | NSA | CP-HI00 | PL | 126.10 | 123.90 | 120.40 | 120.40 | 118.60 | 117.60 | ... | 48.20 | 47.40 | 46.40 | 46.30 | 46.20 | 45.80 | 45.20 | 44.30 | 43.60 | 43.00 |
32 | HICP2015 | NSA | CP-HI00 | PT | 112.07 | 109.49 | 106.74 | 106.25 | 105.97 | 105.96 | ... | 67.23 | 67.37 | 67.37 | 67.17 | 67.03 | 67.03 | 66.76 | 66.23 | 66.09 | 65.76 |
33 | HICP2015 | NSA | CP-HI00 | RO | 127.00 | 124.05 | 121.68 | 120.42 | 118.76 | 118.15 | ... | 3.60 | 3.49 | 3.40 | 3.28 | 3.05 | 3.02 | 2.87 | 2.81 | 2.76 | 2.71 |
34 | HICP2015 | NSA | CP-HI00 | RS | 124.40 | 122.70 | 121.70 | 120.40 | 119.50 | 119.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
35 | HICP2015 | NSA | CP-HI00 | SE | 116.78 | 116.12 | 113.95 | 112.98 | 113.71 | 112.15 | ... | 76.42 | 76.42 | 75.84 | 76.15 | 76.28 | 76.56 | 76.49 | 76.14 | 75.60 | 75.41 |
36 | HICP2015 | NSA | CP-HI00 | SI | 113.63 | 111.30 | 111.73 | 110.48 | 109.83 | 109.71 | ... | 46.59 | 46.17 | 45.99 | 46.22 | 46.13 | 46.04 | 45.76 | 45.07 | 44.43 | 43.88 |
37 | HICP2015 | NSA | CP-HI00 | SK | 122.06 | 120.43 | 118.36 | 117.37 | 114.15 | 113.96 | ... | 44.70 | 44.41 | 43.99 | 43.75 | 43.63 | 43.56 | 43.35 | 43.22 | 43.12 | 43.01 |
38 | HICP2015 | NSA | CP-HI00 | TR | 346.79 | 323.33 | 306.58 | 292.52 | 263.29 | 231.83 | ... | 1.84 | 1.72 | 1.63 | 1.55 | 1.53 | 1.49 | 1.42 | 1.32 | 1.25 | 1.20 |
39 | HICP2015 | NSA | CP-HI00 | UK | NaN | NaN | NaN | NaN | NaN | NaN | ... | 69.30 | 69.30 | 68.90 | 68.60 | 69.00 | 68.90 | 68.70 | 68.40 | 68.10 | 67.80 |
40 | HICP2015 | NSA | CP-HI00 | US | 120.39 | 119.68 | 117.80 | 116.56 | 116.05 | 115.72 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
41 rows × 320 columns
In order to work with the data more easily, I would like the dates to appear as separate rows. So, I need to reshape the data; pd.pivot() and pd.stack() can be used for such operations, but for this case pd.melt() fits my use case better.
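To see what melt does, here is a minimal toy example (illustrative values taken from the table above, not used further below):
toy = pd.DataFrame({"geo": ["AT", "BE"], "2022M01": [113.93, 116.99], "2022M02": [115.40, 119.49]})
toy.melt(id_vars=["geo"]) # one row per (geo, month) pair, with columns "variable" and "value"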
df = df.melt(id_vars=["unit","s_adj","indic","geo\\time"])
df
# I also drop the values that are NA
df = df.dropna()
# there are a lot of functions: sum, count, median, quantile, min, max, mean, var, std etc.
total = df["value"].sum()
total
# we can also get the main statistics of the data with
df["value"].describe()
count 12379.000000
mean 87.920125
std 19.172478
min 1.200000
25% 76.895000
50% 91.700000
75% 100.450000
max 346.790000
Name: value, dtype: float64
We can also apply our own functions to the columns.
# the apply function
def skewness(col):
    avg = col.mean()
    # avg = sum(col)/len(col)
    n = len(col)
    return sum((col-avg)**3)/n/(sum((col-avg)**2)/n)**(3/2)
    # ((n-1)*n)**(1/2)/(n-2)
df[["value"]].apply(skewness)
value -0.045051 dtype: float64
df.skew()
value -0.045056 dtype: float64
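The small difference between the two values comes from the bias correction that pandas applies (the adjusted Fisher-Pearson coefficient). Multiplying our raw estimate by the factor commented out above should roughly reconcile the two (a sketch):
n = len(df["value"])
df[["value"]].apply(skewness) * ((n-1)*n)**(1/2)/(n-2) # approximately equal to df.skew()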
df
 | unit | s_adj | indic | geo\time | variable | value
---|---|---|---|---|---|---|
0 | HICP2015 | NSA | CP-HI00 | AT | 2022M04 | 118.75 |
1 | HICP2015 | NSA | CP-HI00 | BE | 2022M04 | 120.61 |
2 | HICP2015 | NSA | CP-HI00 | BG | 2022M04 | 121.12 |
3 | HICP2015 | NSA | CP-HI00 | CH | 2022M04 | 103.15 |
4 | HICP2015 | NSA | CP-HI00 | CY | 2022M04 | 109.38 |
... | ... | ... | ... | ... | ... | ... |
12950 | HICP2015 | NSA | CP-HI00 | SE | 1996M01 | 75.41 |
12951 | HICP2015 | NSA | CP-HI00 | SI | 1996M01 | 43.88 |
12952 | HICP2015 | NSA | CP-HI00 | SK | 1996M01 | 43.01 |
12953 | HICP2015 | NSA | CP-HI00 | TR | 1996M01 | 1.20 |
12954 | HICP2015 | NSA | CP-HI00 | UK | 1996M01 | 67.80 |
12379 rows × 6 columns
df = df.rename(columns={"variable":"date","value":"hicp","geo\\time":"country"})
# create a new column by assigning a series to the dataframe
# under a new column name
df["year"] = df["date"].str.slice(0,4).astype(int)
df["month"] = df["date"].str.slice(5,7).astype(int)
# we can create lagged variables with shift
df["hicp"].shift(1)
# also arithmetic operations between columns
df["hicp"].shift(1)* df["hicp"]
# but there are other transformations also, cumsum, cumprod, fillna, isnull etc
df["hicp"].cumsum()
0 118.75
1 239.36
2 360.48
3 463.63
4 573.01
...
12950 1088207.34
12951 1088251.22
12952 1088294.23
12953 1088295.43
12954 1088363.23
Name: hicp, Length: 12379, dtype: float64
We can also transform the data by changing their data type.
df[["hicp"]].astype("int16")
# this can also change the memory usage of the dataframe
df[["hicp"]].astype("int16").info()
df[["hicp"]].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12379 entries, 0 to 12954
Data columns (total 1 columns):
hicp 12379 non-null int16
dtypes: int16(1)
memory usage: 120.9 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12379 entries, 0 to 12954
Data columns (total 1 columns):
hicp 12379 non-null float64
dtypes: float64(1)
memory usage: 193.4 KB
# here I download another dataset
df02 = eurostat.get_data_df("ei_lmhr_m",flags=False) # this is the code for unemployment
df2 = df02
df2=df2.loc[((df2["indic"]=="LM-UN-T-TOT")&(df2["s_adj"]=="SA"))]
df2
# df2["s_adj"].unique()
# df2["indic"].unique()
# df2["unit"].unique()
 | unit | s_adj | indic | geo\time | 2022M04 | 2022M03 | 2022M02 | 2022M01 | 2021M12 | 2021M11 | ... | 1983M10 | 1983M09 | 1983M08 | 1983M07 | 1983M06 | 1983M05 | 1983M04 | 1983M03 | 1983M02 | 1983M01
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
607 | PC_ACT | SA | LM-UN-T-TOT | AT | NaN | 4.2 | 4.8 | 4.8 | 4.8 | 5.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
608 | PC_ACT | SA | LM-UN-T-TOT | BE | NaN | 5.6 | 5.6 | 5.6 | 5.7 | 5.8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
609 | PC_ACT | SA | LM-UN-T-TOT | BG | NaN | 4.3 | 4.4 | 4.5 | 4.6 | 4.7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
610 | PC_ACT | SA | LM-UN-T-TOT | CH | NaN | NaN | NaN | NaN | 4.5 | 4.7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
611 | PC_ACT | SA | LM-UN-T-TOT | CY | NaN | 5.9 | 6.3 | 6.5 | 6.6 | 6.6 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
612 | PC_ACT | SA | LM-UN-T-TOT | CZ | NaN | 2.3 | 2.5 | 2.3 | 2.2 | 2.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
613 | PC_ACT | SA | LM-UN-T-TOT | DE | NaN | 2.9 | 3.0 | 3.1 | 3.2 | 3.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
614 | PC_ACT | SA | LM-UN-T-TOT | DK | NaN | 4.5 | 4.4 | 4.6 | 4.5 | 4.5 | ... | 8.4 | 8.4 | 8.5 | 8.5 | 8.5 | 8.4 | 8.4 | 8.3 | 8.2 | 8.1 |
615 | PC_ACT | SA | LM-UN-T-TOT | EA19 | NaN | 6.8 | 6.9 | 6.9 | 7.0 | 7.1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
616 | PC_ACT | SA | LM-UN-T-TOT | EE | NaN | 5.4 | 5.4 | 5.8 | 5.4 | 5.3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
617 | PC_ACT | SA | LM-UN-T-TOT | EL | NaN | 12.9 | 12.8 | 13.0 | 12.8 | 13.4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
618 | PC_ACT | SA | LM-UN-T-TOT | ES | NaN | 13.5 | 13.4 | 13.3 | 13.4 | 13.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
619 | PC_ACT | SA | LM-UN-T-TOT | EU27_2020 | NaN | 6.2 | 6.3 | 6.3 | 6.4 | 6.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
620 | PC_ACT | SA | LM-UN-T-TOT | FI | NaN | 6.4 | 6.5 | 7.0 | 7.2 | 6.8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
621 | PC_ACT | SA | LM-UN-T-TOT | FR | NaN | 7.4 | 7.4 | 7.4 | 7.5 | 7.4 | ... | 7.8 | 7.7 | 7.6 | 7.5 | 7.4 | 7.2 | 7.0 | 6.7 | 6.7 | 7.1 |
622 | PC_ACT | SA | LM-UN-T-TOT | HR | NaN | 6.5 | 6.6 | 6.7 | 6.8 | 7.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
623 | PC_ACT | SA | LM-UN-T-TOT | HU | NaN | 3.2 | 3.7 | 3.8 | 3.6 | 3.8 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
624 | PC_ACT | SA | LM-UN-T-TOT | IE | 4.8 | 5.1 | 4.8 | 5.0 | 5.1 | 5.2 | ... | 14.5 | 14.3 | 14.1 | 14.0 | 13.9 | 13.7 | 13.5 | 13.4 | 13.2 | 12.9 |
625 | PC_ACT | SA | LM-UN-T-TOT | IS | NaN | 4.3 | 4.4 | 4.4 | 4.5 | 4.6 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
626 | PC_ACT | SA | LM-UN-T-TOT | IT | NaN | 8.3 | 8.5 | 8.6 | 8.8 | 9.0 | ... | 7.6 | 7.5 | 7.4 | 7.4 | 7.3 | 7.3 | 7.2 | 7.2 | 7.1 | 6.9 |
627 | PC_ACT | SA | LM-UN-T-TOT | JP | NaN | 2.6 | 2.7 | 2.8 | 2.7 | 2.8 | ... | 2.6 | 2.7 | 2.8 | 2.6 | 2.6 | 2.7 | 2.7 | 2.6 | 2.7 | 2.7 |
628 | PC_ACT | SA | LM-UN-T-TOT | LT | NaN | 6.9 | 7.0 | 7.0 | 6.6 | 6.7 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
629 | PC_ACT | SA | LM-UN-T-TOT | LU | NaN | 4.5 | 4.6 | 4.7 | 4.9 | 4.9 | ... | 3.3 | 3.5 | 3.7 | 3.6 | 3.6 | 3.5 | 3.3 | 3.2 | 3.3 | 3.3 |
630 | PC_ACT | SA | LM-UN-T-TOT | LV | NaN | 7.0 | 7.2 | 7.3 | 7.4 | 7.3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
631 | PC_ACT | SA | LM-UN-T-TOT | MT | NaN | 3.0 | 3.1 | 3.1 | 3.2 | 3.1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
632 | PC_ACT | SA | LM-UN-T-TOT | NL | NaN | 3.3 | 3.4 | 3.6 | 3.8 | 3.7 | ... | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 | 9.5 |
633 | PC_ACT | SA | LM-UN-T-TOT | NO | NaN | NaN | 3.1 | 3.1 | 3.3 | 3.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
634 | PC_ACT | SA | LM-UN-T-TOT | PL | NaN | 3.0 | 3.0 | 3.0 | 3.1 | 3.1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
635 | PC_ACT | SA | LM-UN-T-TOT | PT | NaN | 5.7 | 5.6 | 5.8 | 5.8 | 6.2 | ... | 9.3 | 9.4 | 9.4 | 9.2 | 9.0 | 8.8 | 8.5 | 8.3 | 8.1 | 8.1 |
636 | PC_ACT | SA | LM-UN-T-TOT | RO | NaN | 5.7 | 5.7 | 5.7 | 5.7 | 5.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
637 | PC_ACT | SA | LM-UN-T-TOT | SE | NaN | 7.6 | 7.4 | 8.0 | 7.9 | 8.2 | ... | 3.7 | 3.8 | 3.8 | 3.7 | 3.9 | 3.7 | 3.4 | 3.7 | 3.7 | 3.4 |
638 | PC_ACT | SA | LM-UN-T-TOT | SI | NaN | 4.0 | 4.1 | 4.2 | 4.4 | 4.6 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
639 | PC_ACT | SA | LM-UN-T-TOT | SK | NaN | 6.5 | 6.5 | 6.6 | 6.6 | 6.6 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
640 | PC_ACT | SA | LM-UN-T-TOT | TR | NaN | NaN | NaN | NaN | NaN | 11.2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
641 | PC_ACT | SA | LM-UN-T-TOT | UK | NaN | NaN | NaN | NaN | NaN | NaN | ... | 10.8 | 10.9 | 10.9 | 10.9 | 11.0 | 10.9 | 11.0 | 10.8 | 10.8 | 10.7 |
642 | PC_ACT | SA | LM-UN-T-TOT | US | NaN | 3.6 | 3.8 | 4.0 | 3.9 | 4.2 | ... | 8.8 | 9.2 | 9.5 | 9.4 | 10.1 | 10.1 | 10.2 | 10.3 | 10.4 | 10.4 |
36 rows × 476 columns
# one frame can be appended to another by using the pd.concat method
# merging dataframes:
# inner keeps the rows that have common key values across the merged dataframes
# left keeps the rows of the first dataframe
# right keeps the rows of the second dataframe
# outer keeps the rows of both dataframes
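# a tiny illustration of the difference (toy frames, not used below):
# left = pd.DataFrame({"country": ["DE", "FR"], "hicp": [116.9, 112.8]})
# right = pd.DataFrame({"country": ["DE", "US"], "unemp": [2.9, 3.6]})
# pd.merge(left, right, on="country", how="inner") # only DE survives
# pd.merge(left, right, on="country", how="outer") # DE, FR and US, with NaN where a value is missing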
# I do the same reshaping as before
df2 = df2.melt(id_vars=["unit","s_adj","indic","geo\\time"])
df2 = df2.dropna()
df2 = df2.rename(columns={"variable":"date","value":"unemp","geo\\time":"country"})
# and now I can merge the dataframes
df3 = pd.merge(df[["country","date","year","month","hicp"]], df2[["country","date","unemp"]],on=["country","date"])
df3
 | country | date | year | month | hicp | unemp
---|---|---|---|---|---|---|
0 | IE | 2022M04 | 2022 | 4 | 110.30 | 4.8 |
1 | AT | 2022M03 | 2022 | 3 | 118.00 | 4.2 |
2 | BE | 2022M03 | 2022 | 3 | 120.28 | 5.6 |
3 | BG | 2022M03 | 2022 | 3 | 118.59 | 4.3 |
4 | CY | 2022M03 | 2022 | 3 | 106.13 | 5.9 |
... | ... | ... | ... | ... | ... | ... |
10122 | NO | 1996M01 | 1996 | 1 | 69.20 | 4.8 |
10123 | PT | 1996M01 | 1996 | 1 | 65.76 | 8.0 |
10124 | SE | 1996M01 | 1996 | 1 | 75.41 | 8.7 |
10125 | SI | 1996M01 | 1996 | 1 | 43.88 | 7.2 |
10126 | UK | 1996M01 | 1996 | 1 | 67.80 | 8.2 |
10127 rows × 6 columns
# after applying groupby we should use an aggregation function or a transformation
# for example, the min function gives the first year with data for each country
first_year = df3.groupby("country")["year"].min()
first_year
# we may also want to add a column to a frame that contains the result of a calculation by groups
# so, we do the calculation and then we merge it back to the dataframe
df3 = pd.merge(df3,first_year, on="country", how="left").drop_duplicates().rename(columns={"year_x":"year","year_y":"first_year"})
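# the same column could also be added in one step with transform, which returns a result
# aligned with the original rows (an equivalent sketch):
# df3["first_year"] = df3.groupby("country")["year"].transform("min")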
# we can also apply a transformation after the groupby; the grouping column is not returned,
# only the transformed values, aligned with the original rows
df3["inflation"] = df3["hicp"]-df3.groupby("country")["hicp"].shift(1)
# finally I also define lag inflation
df3["lag_inflation"]=df3.groupby("country")["inflation"].shift(1)
df3
 | country | date | year | month | hicp | unemp | first_year | inflation | lag_inflation
---|---|---|---|---|---|---|---|---|---|
0 | IE | 2022M04 | 2022 | 4 | 110.30 | 4.8 | 1996 | NaN | NaN |
1 | AT | 2022M03 | 2022 | 3 | 118.00 | 4.2 | 1996 | NaN | NaN |
2 | BE | 2022M03 | 2022 | 3 | 120.28 | 5.6 | 1996 | NaN | NaN |
3 | BG | 2022M03 | 2022 | 3 | 118.59 | 4.3 | 2000 | NaN | NaN |
4 | CY | 2022M03 | 2022 | 3 | 106.13 | 5.9 | 2000 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10122 | NO | 1996M01 | 1996 | 1 | 69.20 | 4.8 | 1996 | -0.10 | -0.20 |
10123 | PT | 1996M01 | 1996 | 1 | 65.76 | 8.0 | 1996 | -0.33 | -0.14 |
10124 | SE | 1996M01 | 1996 | 1 | 75.41 | 8.7 | 1996 | -0.19 | -0.54 |
10125 | SI | 1996M01 | 1996 | 1 | 43.88 | 7.2 | 1996 | -0.55 | -0.64 |
10126 | UK | 1996M01 | 1996 | 1 | 67.80 | 8.2 | 1996 | -0.30 | -0.30 |
10127 rows × 9 columns
df3.to_csv("files/infl_unemp.csv")
# from sklearn.linear_model import LinearRegression # standard regression library
import statsmodels.api as sm
df3 = df3.dropna()
df3["const"] = 1
model = sm.OLS(df3["unemp"], df3[["const","inflation","lag_inflation"]])
results = model.fit()
# results.summary() gives the standard info about the regression
results.summary()
# and there is a lot more information (https://www.statsmodels.org/stable/index.html)
print(results.HC0_se)
print(results.bse)
const 0.046410
inflation 0.088461
lag_inflation 0.084309
dtype: float64
const 0.043763
inflation 0.069335
lag_inflation 0.067207
dtype: float64
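The robust covariance can also be requested directly when fitting the model (a short sketch using the cov_type argument of statsmodels):
results_robust = model.fit(cov_type="HC0")
results_robust.bse # now reports the HC0 standard errors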
df3
 | country | date | year | month | hicp | unemp | first_year | inflation | lag_inflation | const
---|---|---|---|---|---|---|---|---|---|---|
48 | IE | 2022M02 | 2022 | 2 | 107.10 | 4.8 | 1996 | -2.20 | -1.00 | 1 |
64 | AT | 2022M01 | 2022 | 1 | 113.93 | 4.8 | 1996 | -1.47 | -2.60 | 1 |
65 | BE | 2022M01 | 2022 | 1 | 116.99 | 5.6 | 1996 | -2.50 | -0.79 | 1 |
66 | BG | 2022M01 | 2022 | 1 | 114.80 | 4.5 | 2000 | -1.36 | -2.43 | 1 |
67 | CY | 2022M01 | 2022 | 1 | 103.33 | 6.5 | 2000 | -0.89 | -1.91 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10122 | NO | 1996M01 | 1996 | 1 | 69.20 | 4.8 | 1996 | -0.10 | -0.20 | 1 |
10123 | PT | 1996M01 | 1996 | 1 | 65.76 | 8.0 | 1996 | -0.33 | -0.14 | 1 |
10124 | SE | 1996M01 | 1996 | 1 | 75.41 | 8.7 | 1996 | -0.19 | -0.54 | 1 |
10125 | SI | 1996M01 | 1996 | 1 | 43.88 | 7.2 | 1996 | -0.55 | -0.64 | 1 |
10126 | UK | 1996M01 | 1996 | 1 | 67.80 | 8.2 | 1996 | -0.30 | -0.30 | 1 |
10057 rows × 10 columns
This code is taken from the following page, with some extra comments added by me: https://wrds-www.wharton.upenn.edu/pages/support/applications/python-replications/fama-french-factors-python/ The original paper can be found here: https://rady.ucsd.edu/faculty/directory/valkanov/pub/classes/mfe/docs/fama_french_jfe_1993.pdf (p. 8 for factors)
##########################################
# Fama French 3 Factors #
# Qingyi (Freda) Song Drechsler #
# Date: April 2018 #
# Updated: June 2020 #
##########################################
import pandas as pd
import numpy as np
import datetime as dt
import wrds
#import psycopg2
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats
###################
# Connect to WRDS #
###################
conn=wrds.Connection()
WRDS recommends setting up a .pgpass file.
You can create this file yourself at any time with the create_pgpass_file() function. Loading library list... Done
# this can be helpful in order to avoid downloading the data multiple times.
# the joblib library can create a file that remembers the output of a function for a given
# input. If the same input is provided later it returns the stored output;
# if a different input is provided then it lets the original function run normally.
from joblib import Memory
# wrap function using joblib
temp_dir = './cache'
mem1 = Memory(temp_dir)
sql = mem1.cache(conn.raw_sql, ignore=['self'])
mem2 = Memory(temp_dir)
table = mem2.cache(conn.get_table, ignore=['self'])
# This dataset has the information for the book value
###################
# Compustat Block #
###################
# check the variable definitions here: https://wrds-www.wharton.upenn.edu/data-dictionary/comp_na_daily_all/
# gvkey: global company key
# at: assets total
# pstkl: preferred stock - liquidating value
# txditc: deferred taxes and investment tax credit
# pstkrv: preferred stock redemption value
# seq: stockholders' equity - total
# pstk: preferred/preference stock (capital) - total
# comp = conn.raw_sql("""
# select gvkey, datadate, at, pstkl, txditc,
# pstkrv, seq, pstk
# from comp.funda
# where indfmt='INDL'
# and datafmt='STD'
# and popsrc='D'
# and consol='C'
# and datadate >= '01/01/1959'
# """, date_cols=['datadate'])
comp = sql("""
select gvkey, datadate, at, pstkl, txditc,
pstkrv, seq, pstk
from comp.funda
where indfmt='INDL'
and datafmt='STD'
and popsrc='D'
and consol='C'
and datadate >= '01/01/1959'
""", date_cols=['datadate'])
comp['year']=comp['datadate'].dt.year
comp
 | gvkey | datadate | at | pstkl | txditc | pstkrv | seq | pstk | year
---|---|---|---|---|---|---|---|---|---|
0 | 001000 | 1961-12-31 | NaN | 0.0 | 0.000 | NaN | NaN | NaN | 1961 |
1 | 001000 | 1962-12-31 | NaN | 0.0 | NaN | NaN | NaN | 0.0 | 1962 |
2 | 001000 | 1963-12-31 | NaN | 0.0 | 0.008 | 0.0 | 0.553 | 0.0 | 1963 |
3 | 001000 | 1964-12-31 | 1.416 | 0.0 | 0.020 | 0.0 | 0.607 | 0.0 | 1964 |
4 | 001000 | 1965-12-31 | 2.310 | 0.0 | 0.000 | 0.0 | 0.491 | 0.0 | 1965 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37795 | 347085 | 2021-02-28 | 192.079 | 0.0 | 2.769 | 0.0 | 56.357 | 0.0 | 2021 |
37796 | 351491 | 2019-12-31 | 17847.000 | 0.0 | 11.000 | 0.0 | 3007.000 | 0.0 | 2019 |
37797 | 351590 | 2019-12-31 | 62131.888 | 0.0 | 106.607 | 0.0 | 11054.622 | 0.0 | 2019 |
37798 | 351590 | 2020-12-31 | 60256.041 | 0.0 | 119.333 | 0.0 | 9914.300 | 0.0 | 2020 |
37799 | 351590 | 2021-12-31 | 62325.449 | 0.0 | 77.338 | 0.0 | 18106.225 | 0.0 | 2021 |
537800 rows × 9 columns
# create preferred stock
# np.where: vectorised version of if condition then A else B
comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv'])
comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps'])
comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps'])
comp['txditc']=comp['txditc'].fillna(0)
# create book equity
comp['be']=comp['seq']+comp['txditc']-comp['ps']
comp['be']=np.where(comp['be']>0, comp['be'], np.nan) # disallows negative equity
# number of years in Compustat
comp=comp.sort_values(by=['gvkey','datadate']) # sort
comp['count']=comp.groupby(['gvkey']).cumcount() # count the periods
comp=comp[['gvkey','datadate','year','be','count']] # rearrange and keep relevant columns
# This dataset has the information on the returns
###################
# CRSP Block #
###################
# sql similar to crspmerge macro
# variable definitions here: https://wrds-www.wharton.upenn.edu/data-dictionary/crsp_a_stock/
# msf
# ret: returns
# retx: returns without dividends
# shrout: shares outstanding
# prc: price
# msenames
# shrcd: share code
# exchcd: exchange code
# crsp_m = conn.raw_sql("""
# select a.permno, a.permco, a.date, b.shrcd, b.exchcd,
# a.ret, a.retx, a.shrout, a.prc
# from crsp.msf as a
# left join crsp.msenames as b
# on a.permno=b.permno
# and b.namedt<=a.date
# and a.date<=b.nameendt
# where a.date between '01/01/1959' and '12/31/2017'
# and b.exchcd between 1 and 3
# """, date_cols=['date'])
crsp_m = sql("""
select a.permno, a.permco, a.date, b.shrcd, b.exchcd,
a.ret, a.retx, a.shrout, a.prc
from crsp.msf as a
left join crsp.msenames as b
on a.permno=b.permno
and b.namedt<=a.date
and a.date<=b.nameendt
where a.date between '01/01/1959' and '12/31/2017'
and b.exchcd between 1 and 3
""", date_cols=['date'])
# change variable format to int
crsp_m[['permco','permno','shrcd','exchcd']]=crsp_m[['permco','permno','shrcd','exchcd']].astype(int)
# Line up date to be end of month
crsp_m['jdate']=crsp_m['date']+MonthEnd(0)
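# MonthEnd(0) rolls a date forward to the last day of its month (dates that already fall on a
# month end are left unchanged), e.g.:
# pd.Timestamp("1993-01-15") + MonthEnd(0) # -> Timestamp('1993-01-31')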
# add delisting return
# dlret: delisting return
# dlstdt: delisting date
# dlret = conn.raw_sql("""
# select permno, dlret, dlstdt
# from crsp.msedelist
# """, date_cols=['dlstdt'])
dlret = sql("""
select permno, dlret, dlstdt
from crsp.msedelist
""", date_cols=['dlstdt'])
dlret.permno=dlret.permno.astype(int)
#dlret['dlstdt']=pd.to_datetime(dlret['dlstdt'])
dlret['jdate']=dlret['dlstdt']+MonthEnd(0)
crsp = pd.merge(crsp_m, dlret, how='left',on=['permno','jdate']) # merging
crsp['dlret']=crsp['dlret'].fillna(0)
crsp['ret']=crsp['ret'].fillna(0)
# retadj factors in the delisting returns
crsp['retadj']=(1+crsp['ret'])*(1+crsp['dlret'])-1
#---> Here the market equity is defined
# calculate market equity
crsp['me']=crsp['prc'].abs()*crsp['shrout']
crsp=crsp.drop(['dlret','dlstdt','prc','shrout'], axis=1)
crsp=crsp.sort_values(by=['jdate','permco','me'])
### Aggregate Market Cap ###
# sum of me across different permno belonging to the same permco on a given date
crsp_summe = crsp.groupby(['jdate','permco'])['me'].sum().reset_index() # operations by group
# largest mktcap within a permco/date
crsp_maxme = crsp.groupby(['jdate','permco'])['me'].max().reset_index()
# join by jdate/maxme to find the permno
crsp1=pd.merge(crsp, crsp_maxme, how='inner', on=['jdate','permco','me'])
# drop me column and replace with the sum me
crsp1=crsp1.drop(['me'], axis=1)
# join with sum of me to get the correct market cap info
crsp2=pd.merge(crsp1, crsp_summe, how='inner', on=['jdate','permco'])
# sort by permno and date and also drop duplicates
crsp2=crsp2.sort_values(by=['permno','jdate']).drop_duplicates()
# keep December market cap
crsp2['year']=crsp2['jdate'].dt.year
crsp2['month']=crsp2['jdate'].dt.month
decme=crsp2[crsp2['month']==12] #---> Only keeps the Decembers in the sample
decme=decme[['permno','date','jdate','me','year']].rename(columns={'me':'dec_me'})
### July to June dates
crsp2['ffdate']=crsp2['jdate']+MonthEnd(-6)
crsp2['ffyear']=crsp2['ffdate'].dt.year
crsp2['ffmonth']=crsp2['ffdate'].dt.month
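# shifting six month-ends back maps each July-June window onto a single Fama-French year, e.g.
# jdate 1993-07-31 -> ffdate 1993-01-31 (ffyear 1993) and jdate 1994-06-30 -> 1993-12-31 (ffyear 1993)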
crsp2['1+retx']=1+crsp2['retx']
crsp2=crsp2.sort_values(by=['permno','date'])
# cumret by stock
# ---> transformation of a column
crsp2['cumretx']=crsp2.groupby(['permno','ffyear'])['1+retx'].cumprod()
# lag cumret
crsp2['lcumretx']=crsp2.groupby(['permno'])['cumretx'].shift(1)
# lag market cap
crsp2['lme']=crsp2.groupby(['permno'])['me'].shift(1)
# if first permno then use me/(1+retx) to replace the missing value
crsp2['count']=crsp2.groupby(['permno']).cumcount()
crsp2['lme']=np.where(crsp2['count']==0, crsp2['me']/crsp2['1+retx'], crsp2['lme'])
# baseline me
mebase=crsp2[crsp2['ffmonth']==1][['permno','ffyear', 'lme']].rename(columns={'lme':'mebase'})
# merge result back together
crsp3=pd.merge(crsp2, mebase, how='left', on=['permno','ffyear'])
crsp3['wt']=np.where(crsp3['ffmonth']==1, crsp3['lme'], crsp3['mebase']*crsp3['lcumretx'])
decme['year']=decme['year']+1
decme=decme[['permno','year','dec_me']]
# Info as of June
crsp3_jun = crsp3[crsp3['month']==6] # ---> just filter the observations in June.
crsp_jun = pd.merge(crsp3_jun, decme, how='inner', on=['permno','year'])
crsp_jun=crsp_jun[['permno','date', 'jdate', 'shrcd','exchcd','retadj','me','wt','cumretx','mebase','lme','dec_me']]
crsp_jun=crsp_jun.sort_values(by=['permno','jdate']).drop_duplicates()
# This dataset contains the information needed to link with the Compustat data
# which was used to compute the book value
#######################
# CCM Block #
#######################
# variable definitions here: https://wrds-www.wharton.upenn.edu/data-dictionary/crsp_a_ccm/ccmxpf_linktable/
# ccm=conn.raw_sql("""
# select gvkey, lpermno as permno, linktype, linkprim,
# linkdt, linkenddt
# from crsp.ccmxpf_linktable
# where substr(linktype,1,1)='L'
# and (linkprim ='C' or linkprim='P')
# """, date_cols=['linkdt', 'linkenddt'])
ccm=sql("""
select gvkey, lpermno as permno, linktype, linkprim,
linkdt, linkenddt
from crsp.ccmxpf_linktable
where substr(linktype,1,1)='L'
and (linkprim ='C' or linkprim='P')
""", date_cols=['linkdt', 'linkenddt'])
# if linkenddt is missing then set to today date
ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today'))
ccm1=pd.merge(comp[['gvkey','datadate','be', 'count']],ccm,how='left',on=['gvkey'])
ccm1['yearend']=ccm1['datadate']+YearEnd(0)
ccm1['jdate']=ccm1['yearend']+MonthEnd(6)
# set link date bounds
ccm2=ccm1[(ccm1['jdate']>=ccm1['linkdt'])&(ccm1['jdate']<=ccm1['linkenddt'])]
ccm2=ccm2[['gvkey','permno','datadate','yearend', 'jdate','be', 'count']]
# link comp and crsp
ccm_jun=pd.merge(crsp_jun, ccm2, how='inner', on=['permno', 'jdate']) # ---> This is merging the two main datasets
ccm_jun['beme']=ccm_jun['be']*1000/ccm_jun['dec_me'] # ---> calculate book to market ratio
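# presumably the factor of 1000 aligns the units: Compustat book equity (be) is reported in $ millions,
# while CRSP market equity (|prc| * shrout, with shrout in thousands of shares) is in $ thousands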
# select NYSE stocks for bucket breakdown
# exchcd = 1 and positive beme and positive me and shrcd in (10,11) and at least 2 years in comp
nyse=ccm_jun[(ccm_jun['exchcd']==1) & (ccm_jun['beme']>0) & (ccm_jun['me']>0) & \
(ccm_jun['count']>=1) & ((ccm_jun['shrcd']==10) | (ccm_jun['shrcd']==11))]
# size breakdown
nyse_sz=nyse.groupby(['jdate'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'}) # ---> define median
# beme breakdown
nyse_bm=nyse.groupby(['jdate'])['beme'].describe(percentiles=[0.3, 0.7]).reset_index() # ---> define value percentiles
nyse_bm=nyse_bm[['jdate','30%','70%']].rename(columns={'30%':'bm30', '70%':'bm70'})
nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on=['jdate'])
# join back size and beme breakdown
ccm1_jun = pd.merge(ccm_jun, nyse_breaks, how='left', on=['jdate'])
# function to assign sz and bm bucket
def sz_bucket(row):
    if pd.isna(row['me']):
        value=''
    elif row['me']<=row['sizemedn']:
        value='S'
    else:
        value='B'
    return value
def bm_bucket(row):
    if 0<=row['beme']<=row['bm30']:
        value='L'
    elif row['beme']<=row['bm70']:
        value='M'
    elif row['beme']>row['bm70']:
        value='H'
    else:
        value=''
    return value
# assign size portfolio # ---> assignment
ccm1_jun['szport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(sz_bucket, axis=1), '')
# assign book-to-market portfolio ---> assignment
ccm1_jun['bmport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(bm_bucket, axis=1), '')
# create positive beme (posbm) and nonmissport variables
ccm1_jun['posbm']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), 1, 0)
ccm1_jun['nonmissport']=np.where((ccm1_jun['bmport']!=''), 1, 0)
# store portfolio assignment as of June
june=ccm1_jun[['permno','date', 'jdate', 'bmport','szport','posbm','nonmissport']]
june['ffyear']=june['jdate'].dt.year
# merge back with monthly records
crsp3 = crsp3[['date','permno','shrcd','exchcd','retadj','me','wt','cumretx','ffyear','jdate']]
ccm3=pd.merge(crsp3,
june[['permno','ffyear','szport','bmport','posbm','nonmissport']], how='left', on=['permno','ffyear'])
# keeping only records that meet the criteria
ccm4=ccm3[(ccm3['wt']>0)& (ccm3['posbm']==1) & (ccm3['nonmissport']==1) &
((ccm3['shrcd']==10) | (ccm3['shrcd']==11))]
/tmp/ipykernel_745230/1430860703.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  june['ffyear']=june['jdate'].dt.year
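The warning above appears because june is a slice of ccm1_jun. One way to avoid it (a sketch of the fix the message suggests) is to take an explicit copy before adding the new column:
june = ccm1_jun[['permno','date', 'jdate', 'bmport','szport','posbm','nonmissport']].copy()
june['ffyear'] = june['jdate'].dt.year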
############################
# Form Fama French Factors #
############################
# function to calculate value weighted return
def wavg(group, avg_name, weight_name):
d = group[avg_name]
w = group[weight_name]
try:
return (d * w).sum() / w.sum()
except ZeroDivisionError:
return np.nan
# value-weighted return
vwret=ccm4.groupby(['jdate','szport','bmport']).apply(wavg, 'retadj','wt').to_frame().reset_index().rename(columns={0: 'vwret'})
vwret['sbport']=vwret['szport']+vwret['bmport']
# firm count
vwret_n=ccm4.groupby(['jdate','szport','bmport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'})
vwret_n['sbport']=vwret_n['szport']+vwret_n['bmport']
# transpose: pivot the portfolio returns into wide format
ff_factors=vwret.pivot(index='jdate', columns='sbport', values='vwret').reset_index()
ff_nfirms=vwret_n.pivot(index='jdate', columns='sbport', values='n_firms').reset_index()
# create SMB and HML factors
ff_factors['WH']=(ff_factors['BH']+ff_factors['SH'])/2
ff_factors['WL']=(ff_factors['BL']+ff_factors['SL'])/2
ff_factors['WHML'] = ff_factors['WH']-ff_factors['WL']
ff_factors['WB']=(ff_factors['BL']+ff_factors['BM']+ff_factors['BH'])/3
ff_factors['WS']=(ff_factors['SL']+ff_factors['SM']+ff_factors['SH'])/3
ff_factors['WSMB'] = ff_factors['WS']-ff_factors['WB']
ff_factors=ff_factors.rename(columns={'jdate':'date'})
# n firm count
ff_nfirms['H']=ff_nfirms['SH']+ff_nfirms['BH']
ff_nfirms['L']=ff_nfirms['SL']+ff_nfirms['BL']
ff_nfirms['HML']=ff_nfirms['H']+ff_nfirms['L']
ff_nfirms['B']=ff_nfirms['BL']+ff_nfirms['BM']+ff_nfirms['BH']
ff_nfirms['S']=ff_nfirms['SL']+ff_nfirms['SM']+ff_nfirms['SH']
ff_nfirms['SMB']=ff_nfirms['B']+ff_nfirms['S']
ff_nfirms['TOTAL']=ff_nfirms['SMB']
ff_nfirms=ff_nfirms.rename(columns={'jdate':'date'})
###################
# Compare With FF #
###################
# _ff = conn.get_table(library='ff', table='factors_monthly')
_ff = table(library='ff', table='factors_monthly')
_ff=_ff[['date','smb','hml']]
_ff['date']=_ff['date']+MonthEnd(0)
_ffcomp = pd.merge(_ff, ff_factors[['date','WSMB','WHML']], how='inner', on=['date'])
_ffcomp70=_ffcomp[_ffcomp['date']>='01/01/1970']
print(stats.pearsonr(_ffcomp70['smb'], _ffcomp70['WSMB']))
print(stats.pearsonr(_ffcomp70['hml'], _ffcomp70['WHML']))
(0.9959347155821368, 0.0)
(0.9821803335953425, 0.0)
_ffcomp.head(2)
 | date | smb | hml | WSMB | WHML
---|---|---|---|---|---|
0 | 1961-07-31 | -0.0184 | -0.0018 | NaN | NaN |
1 | 1961-08-31 | -0.0177 | -0.0027 | NaN | NaN |
_ffcomp.dtypes
date datetime64[ns]
smb float64
hml float64
WSMB float64
WHML float64
dtype: object
_ffcomp.tail(2)
 | date | smb | hml | WSMB | WHML
---|---|---|---|---|---|
676 | 2017-11-30 | -0.0056 | -0.0005 | -0.003746 | -0.003502 |
677 | 2017-12-31 | -0.0132 | 0.0003 | -0.010692 | -0.001674 |
_ffcomp.date = pd.to_datetime(_ffcomp.date)
_ffcomp.set_index('date', inplace=True)
_ffcomp.head(2)
date | smb | hml | WSMB | WHML
---|---|---|---|---
1961-07-31 | -0.0184 | -0.0018 | NaN | NaN |
1961-08-31 | -0.0177 | -0.0027 | NaN | NaN |
plt.figure(figsize=(16,12))
plt.suptitle('Comparison of Results', fontsize=20)
ax1 = plt.subplot(211)
ax1.set_title('SMB', fontsize=15)
ax1.set_xlim([dt.datetime(1962,6,1), dt.datetime(2017,12,31)])
ax1.plot(_ffcomp['smb'], 'r--', _ffcomp['WSMB'], 'b-')
ax1.legend(('smb','WSMB'), loc='upper right', shadow=True)
ax2 = plt.subplot(212)
ax2.set_title('HML', fontsize=15)
ax2.plot(_ffcomp['hml'], 'r--', _ffcomp['WHML'], 'b-')
ax2.set_xlim([dt.datetime(1962,6,1), dt.datetime(2017,12,31)])
ax2.legend(('hml','WHML'), loc='upper right', shadow=True)
plt.subplots_adjust(top=0.92, hspace=0.2)
plt.show()
/usr/lib/python3/dist-packages/pandas/plotting/_matplotlib/converter.py:103: FutureWarning:
Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.
To register the converters:
>>> from pandas.plotting import register_matplotlib_converters
>>> register_matplotlib_converters()
warnings.warn(msg, FutureWarning)
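As the warning suggests, the converters can be registered explicitly once before plotting (a short sketch following the message above):
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()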