Spark Funds is an asset management company. It wants to make investments in a few companies. The CEO of Spark Funds wants to understand the global trends in investments so that she can take the investment decisions effectively.
Spark Funds has few minor constraints for investments:
It wants to invest between 5 to 15 million USD per round of investment
It wants to invest only in English-speaking countries because of the ease of communication with the companies it would invest in.
It wants to choose one investment type from the 4 major types (Seed, Angel, Venture, Private Equity) for each potential investment they make.
What is the strategy?
Spark Funds wants to invest where most other investors are investing. This pattern is often observed among early stage startup investors.
Where did we get the data from?
We have taken real investment data from crunchbase.com.
Datasets:
Name: companies.csv
Desc: A table with basic data of companies (Company details)
Name: rounds2.csv
Desc: Contains details about companies funding per round.
Name: mapping.csv
Desc: This file maps the numerous category names in the companies table (such 3D printing, aerospace, agriculture, etc.) to eight broad sector names. The purpose is to simplify the analysis into eight sector buckets, rather than trying to analyse hundreds of them.
The objective is to identify the best sectors, countries, and a suitable investment type for making investments. The overall strategy is to invest where others are investing, implying that the 'best' sectors and countries are the ones 'where most investors are investing'.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.mode.chained_assignment = None # default='warn'
On observation we see that file is encoded using multiple encoding types (maybe because the ‘company_permalink’ column contains names of companies in various countries, and hence various languages).
# encoding specified to get rid of encoding issues for specific rows
companies = pd.read_csv("\\companies.txt", sep='\t', encoding="ISO-8859-1")
rounds2 = pd.read_csv("\\rounds2.csv", encoding="ISO-8859-1")
sect_map = pd.read_csv("\\mapping.csv")
companies.head()
companies.info()
companies.describe()
# Nan values for every column in Companies dataset
companies.isna().sum()
company_permalink column has upper/lower case entries. Should standardize it to lower case.
rounds2.info()
# Nan values for every column in rounds2 dataset
rounds2.isna().sum()
companies.permalink = companies.permalink.str.encode('utf-8').str.decode('ascii', 'ignore') # Removing problematic characters
companies.permalink = companies.permalink.apply(lambda x: x.lower()) # To lower
print(f"There are {companies.permalink.nunique()} unique rows in companies table.")
rounds2.company_permalink = rounds2.company_permalink.str.encode('utf-8').str.decode('ascii', 'ignore') # Removing problematic characters
rounds2.company_permalink = rounds2.company_permalink.apply(lambda x: x.lower()) # To lower
print(f"There are {companies.permalink.nunique()} unique rows in rounds2 table.")
def check_for_equality(a, b):
"""
Checks if all values between two Series/Dataframes are same.
Returns disimilliar values if they or not.
"""
seta = set(a.flatten())
setb = set(b.flatten())
print(f"A: {len(seta)}, B: {len(setb)}, Intersection: {len(seta.intersection(setb))}")
if len(seta - setb) > 0:
print(seta - setb)
print(setb - seta)
else: print("They are the same")
check_for_equality(companies.permalink.unique(), rounds2.company_permalink.unique())
Few names have unnecesary spaces
# Fixing spaces issue
companies.permalink = companies.permalink.str.replace(' ', '')
rounds2.company_permalink = rounds2.company_permalink.str.replace(' ', '')
check_for_equality(companies.permalink.unique(), rounds2.company_permalink.unique())
# Setting respective primary keys as index for both dataframes
companies.set_index('permalink', inplace=True)
rounds2.set_index('company_permalink', inplace=True)
# Creating Primary Key
master_frame = rounds2.join(companies)
print(f"There are {len(master_frame)} observations in master_frame.")
master_frame.head()
# Frequency of every type of funding
master_frame.funding_round_type.value_counts()
# We can drop all rows which are not of type venture, seed, angel, private_equity
master_frame = master_frame[master_frame.funding_round_type.isin(['venture', 'seed', 'angel', 'private_equity'])]
round_types = master_frame.funding_round_type.value_counts()
# Create a pieplot
plt.pie(round_types.values, labels=round_types.index, radius=1.5, wedgeprops = { 'linewidth' : 6, 'edgecolor' : 'white' })
# add a circle at the center
my_circle=plt.Circle( (0,0), 1, color='white')
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
print(round_types)
# Average funding amount for venture type
venture_mean = master_frame[master_frame['funding_round_type']=='venture'].raised_amount_usd.mean()
print(f"Average funding amount raised: {round(venture_mean, 2)}")
# Average funding amount for angel type
angel_mean = master_frame[master_frame['funding_round_type']=='angel'].raised_amount_usd.mean()
print(f"Average funding amount raised: {round(angel_mean, 2)}")
# Average funding amount for seed type
seed_mean = master_frame[master_frame['funding_round_type']=='seed'].raised_amount_usd.mean()
print(f"Average funding amount raised: {round(seed_mean, 2)}")
# Average funding amount for private equity type
equity_mean = master_frame[master_frame['funding_round_type']=='private_equity'].raised_amount_usd.mean()
print(f"Average funding amount raised: {round(equity_mean, 2)}")
plt.figure(figsize=(20, 16))
ax1 = plt.subplot(4, 4, 1)
ax2 = plt.subplot(4, 4, 3)
sns.barplot(x=round_types.index, y=[venture_mean, seed_mean, angel_mean, equity_mean], ax=ax1)
sns.barplot(x=round_types.index[1:3], y=[seed_mean, angel_mean], ax=ax2)
plt.show()
# Finding ideal investment type
inv_dict = {'venture': venture_mean, 'angel': angel_mean, 'seed': seed_mean, 'equity': equity_mean}
for inv_type in inv_dict:
if inv_dict[inv_type] > 5000000 and inv_dict[inv_type] < 15000000:
print(f"Ideal investment type for Spark Funds is: {inv_type}")
# top 9 countries with the highest venture type funding
top9 = master_frame[master_frame['funding_round_type']=='venture'].groupby('country_code').sum().sort_values('raised_amount_usd', ascending=False).head(9)
plt.figure(figsize=(12, 8))
sns.barplot(x=top9.index, y=top9.values.ravel())
plt.show()
print(top9, '\n')
print("Top 3 english speaking countries in order are: USA, GBR, IND")
# Generating a Primary Sector Column
master_frame['primary_sector'] = 0
master_frame['primary_sector'] = master_frame['category_list'].str.split('|', expand=True)
# correcting improperly decoded words from mapping.csv
sect_map.reset_index(inplace=True)
sect_map['category_list'] = sect_map['category_list'].str.replace('0', 'na')
# set category_list as index column for mapping df
sect_map.set_index('category_list', inplace=True)
sect_map.drop('index', axis=1, inplace=True)
# Generate dictionary of [primary_sec: main_sec] pairs
sector_dict = {idx: next(k for k in sect_map if row[k]==1)
for idx, row in sect_map.iterrows()}
# map the main sectors using sector_dictionary
master_frame['main_sector'] = master_frame['primary_sector'].map(sector_dict)
# Drop all rows which couldn't get mapped (and we drop blanks as there are only 8 sectors)
master_frame.dropna(subset=['main_sector'], inplace=True)
master_frame = master_frame[master_frame['main_sector'] != 'Blanks']
sectors = master_frame.main_sector.value_counts()
plt.figure(figsize=(8, 6))
sns.barplot(x=sectors.values, y=sectors.index)
plt.show()
print(sectors)
master_frame[['name','primary_sector', 'main_sector']].head(10)
We will choose to ignore any existing Nan values instead of dropping them altogether (As it changes certain properties such as entry counts)
# Aggregate of amount raised per round type
amount_per_type = master_frame.groupby('funding_round_type').agg({'raised_amount_usd': 'sum'})
plt.figure(figsize=(20, 16))
ax1 = plt.subplot(4, 4, 1)
ax2 = plt.subplot(4, 4, 3)
sns.barplot(x=amount_per_type.index, y=amount_per_type.values.ravel(), ax=ax1)
sns.barplot(x=amount_per_type.index[:3], y=amount_per_type.values.ravel()[:3], ax=ax2)
plt.show()
print(amount_per_type)
Creating the three country-wise data frame filtered by venture type funding
condition_1 = (master_frame['funding_round_type']=='venture')
condition_2 = ( master_frame['raised_amount_usd'].isin(np.arange(5000000, 15000000 + 1)) )
condition = (condition_1 & condition_2)
c1 = master_frame[condition & (master_frame['country_code']=='USA')]
c2 = master_frame[condition & (master_frame['country_code']=='GBR')]
c3 = master_frame[condition & (master_frame['country_code']=='IND')]
# number of entries overall per sector in master_frame (all countries)
dic = master_frame['main_sector'].value_counts()
dic
def get_count_and_inv(df):
'''
Function that accepts dataframe and generates:
1) Column for total number (or count) of investments for each main sector.
2) Column for total amount invested in each main sector
3) Values for all questions in table 5.1
'''
dic = df['main_sector'].value_counts().to_dict()
df['main_sector_count'] = df['main_sector'].map(dic).copy()
dic = df[['main_sector', 'raised_amount_usd']].groupby(df['main_sector']).sum()['raised_amount_usd'].to_dict()
df['main_sector_investment'] = df['main_sector'].map(dic).copy()
sect_name = list(df.groupby('main_sector').count().sort_values('funding_round_permalink', ascending=False).head(3).index)
sect_inv = list(df.groupby('main_sector').count().sort_values('funding_round_permalink', ascending=False).head(3)['funding_round_permalink'])
top_sec_comp = list(df[df['main_sector'] == sect_name[0]].groupby('name').sum().sort_values('raised_amount_usd', ascending=False).head(1).reset_index()['name'])[0]
second_sec_comp = list(df[df['main_sector'] == sect_name[1]].groupby('name').sum().sort_values('raised_amount_usd', ascending=False).head(1).reset_index()['name'])[0]
print(f'''
For {c}:
Total number of investments: {df.funding_round_permalink.count()}
Total amount of investments: {df.raised_amount_usd.sum()}
Top sector name: {sect_name[0]}
Second sector name: {sect_name[1]}
Third sector name: {sect_name[2]}
Number of investments in top sector: {sect_inv[0]}
Number of investments in second sector: {sect_inv[1]}
Number of investments in third sector: {sect_inv[2]}
Company that recieved highest investment in top sector was: {top_sec_comp}
Company that recieved highest investment second sector was: {second_sec_comp}
''')
for df, c in zip((c1, c2, c3), ('USA', 'GBR', 'IND')):
get_count_and_inv(df)
c1.head()
c2.head()
c3.head()
master_frame.to_csv("\\master_frame.csv")