Big Data Workflow: Augmenting pandas with SQLite¶

When we need to analyze data contained in a .csv file that is too big to fit into memory, what can we do? In a separate notebook, we investigated how chunk processing in pandas can be used to work through such a file. Some analyses, however, require an entire column, or several columns, to be held in memory at once. One popular framework well equipped to work around such issues is MapReduce, which we discuss in a future notebook.

In this notebook, however, we'll use investment data from Crunchbase to demonstrate how SQL can be used together with pandas to create a workflow for analyzing big datasets without straining our memory budget: read the file in chunks with pandas, load the cleaned chunks into a SQLite database, and answer questions by querying the database and pulling only small result sets back into memory.

To simulate working with a much larger dataset, we impose an arbitrary memory constraint of 10MB throughout the notebook.
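To make this concrete before diving in, here is a minimal sketch of the skeleton of the workflow, with all of the cleaning steps omitted. It uses a throwaway database and table name (workflow_sketch.db and sketch, neither of which is used elsewhere in this notebook) so that it doesn't interfere with the database we build later.

In [ ]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('workflow_sketch.db')   # throwaway database, for illustration only

# 1. Stream the CSV into a SQLite table one chunk at a time; the full file never sits in memory.
for chunk in pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1'):
    chunk.to_sql('sketch', conn, if_exists='append', index=False)

# 2. Let SQLite do the heavy aggregation; only the small result comes back as a dataframe.
result = pd.read_sql('SELECT COUNT(*) AS num_rows FROM sketch;', conn)

# 3. Confirm the result respects our self-imposed 10MB budget.
print(result.memory_usage(deep=True).sum() / (1024 ** 2), 'MB')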

Summary¶

In this notebook, we

  • used chunk processing with pandas to decrease the total memory consumption of the data by over 30%
  • converted the processed data into a database, chunk by chunk, using sqlite3
  • queried the resulting database to answer several business questions while remaining within the memory limit

Overall, we demonstrated a useful workflow for analyzing big datasets using pandas and sqlite3.

About the Data¶

The data analyzed in this notebook contains information regarding crunchbase fundraising rounds from October 2013, and can be found here. Since the purpose of this notebook is to demonstrate a useful workflow for analyzing large datasets, we don't include a data dictionary here. Column description information can be found in the above link.

Chunk Processing with Pandas¶

To begin, we'll use pandas to read the dataset chunk by chunk. Let's use this framework to determine how many missing values each column has.

In [ ]:
import pandas as pd
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

missing = []
for chunk in chunk_iter:
    missing_chunk = chunk.isna().sum(axis=0)
    print(missing_chunk.values.tolist())
    missing.append(missing_chunk)

missing = pd.concat(missing)
# sum the per-chunk counts to get the total number of missing values in each column
df = pd.DataFrame(data=missing.index, columns=['column name'])
df["counts"] = missing.values
df.groupby('column name').sum()
[0, 0, 52, 0, 53, 0, 64, 0, 0, 2557, 778, 1371, 0, 900, 0, 0, 0, 0, 0, 653]
[0, 0, 51, 0, 43, 0, 45, 0, 0, 5000, 261, 714, 0, 313, 0, 0, 0, 0, 0, 239]
[0, 0, 82, 0, 56, 0, 44, 0, 0, 5000, 271, 808, 0, 314, 0, 0, 0, 0, 0, 221]
[0, 0, 57, 0, 36, 0, 44, 0, 0, 5000, 239, 714, 0, 274, 0, 0, 0, 0, 0, 299]
[0, 0, 69, 0, 40, 0, 41, 0, 0, 5000, 352, 1116, 0, 434, 0, 0, 0, 0, 0, 243]
[0, 0, 61, 0, 39, 0, 33, 0, 0, 5000, 218, 732, 0, 238, 0, 0, 0, 0, 0, 206]
[1, 1, 88, 1, 35, 1, 42, 2, 2, 5000, 313, 922, 2, 339, 3, 3, 3, 3, 3, 287]
[0, 0, 83, 0, 65, 0, 55, 0, 0, 5000, 267, 775, 0, 334, 0, 0, 0, 0, 0, 368]
[0, 0, 75, 0, 50, 0, 50, 0, 0, 5000, 1432, 1787, 0, 1464, 0, 0, 0, 0, 0, 324]
[0, 0, 15, 0, 46, 0, 70, 0, 0, 5000, 5000, 5000, 0, 5000, 0, 0, 0, 0, 0, 469]
[0, 0, 10, 0, 29, 0, 45, 0, 0, 2870, 2870, 2870, 0, 2870, 0, 0, 0, 0, 0, 290]
Out[ ]:
counts
column name
company_category_code 643
company_city 533
company_country_code 1
company_name 1
company_permalink 1
company_region 1
company_state_code 492
funded_at 3
funded_month 3
funded_quarter 3
funded_year 3
funding_round_type 3
investor_category_code 50427
investor_city 12480
investor_country_code 12001
investor_name 2
investor_permalink 2
investor_region 2
investor_state_code 16809
raised_amount_usd 3599

We see that a handful of columns have quite a few missing values; the investor_category_code column stands out in particular.
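As a rough way to quantify this, we can express each column's missing count as a share of the total number of rows; columns missing most of their values are natural candidates for dropping. Here is a minimal sketch, assuming the df of per-column missing counts built above is still in scope (it re-reads the file in chunks just to count the rows):

In [ ]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
n_rows = sum(len(chunk) for chunk in chunk_iter)   # total row count, obtained one chunk at a time

missing_per_col = df.groupby('column name')['counts'].sum()
missing_share = (missing_per_col / n_rows).sort_values(ascending=False)
print(missing_share.head())

Next, let's look at the total and column by column memory usage of the dataframe.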

In [ ]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

mem_usage = 0
for i, chunk in enumerate(chunk_iter):
    chunk_mem_usage = 0
    print(f"Chunk {i}:")
    cols = chunk.columns
    for col in cols:
        # deep=True accounts for the memory actually used by the Python strings in object columns
        mem = chunk[col].memory_usage(deep=True) / (1024 * 1024)
        print(f"{col} memory usage: {round(mem,2)}MB")
        mem_usage += mem
        chunk_mem_usage += mem
    print(f"Chunk {i} memory usage: {round(chunk_mem_usage,2)}MB\n")

print(f"Total Memory usage: {round(mem_usage,2)}MB")
Chunk 0:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.32MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.41MB
investor_name memory usage: 0.34MB
investor_category_code memory usage: 0.23MB
investor_country_code memory usage: 0.27MB
investor_state_code memory usage: 0.25MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.29MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 0 memory usage: 5.58MB

Chunk 1:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.33MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.47MB
investor_name memory usage: 0.36MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.3MB
investor_city memory usage: 0.31MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 1 memory usage: 5.53MB

Chunk 2:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.33MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.47MB
investor_name memory usage: 0.36MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.31MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 2 memory usage: 5.54MB

Chunk 3:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.32MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.47MB
investor_name memory usage: 0.36MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.31MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 3 memory usage: 5.53MB

Chunk 4:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.33MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.48MB
investor_name memory usage: 0.36MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.25MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.3MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 4 memory usage: 5.53MB

Chunk 5:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.33MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.48MB
investor_name memory usage: 0.37MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.31MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 5 memory usage: 5.56MB

Chunk 6:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.33MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.48MB
investor_name memory usage: 0.36MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.3MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 6 memory usage: 5.53MB

Chunk 7:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.32MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.47MB
investor_name memory usage: 0.35MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.28MB
investor_state_code memory usage: 0.26MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.31MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 7 memory usage: 5.51MB

Chunk 8:
company_permalink memory usage: 0.37MB
company_name memory usage: 0.32MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.45MB
investor_name memory usage: 0.35MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.25MB
investor_state_code memory usage: 0.24MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.27MB
funding_round_type memory usage: 0.31MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 8 memory usage: 5.4MB

Chunk 9:
company_permalink memory usage: 0.36MB
company_name memory usage: 0.32MB
company_category_code memory usage: 0.31MB
company_country_code memory usage: 0.29MB
company_state_code memory usage: 0.28MB
company_region memory usage: 0.31MB
company_city memory usage: 0.32MB
investor_permalink memory usage: 0.37MB
investor_name memory usage: 0.33MB
investor_category_code memory usage: 0.04MB
investor_country_code memory usage: 0.04MB
investor_state_code memory usage: 0.04MB
investor_region memory usage: 0.31MB
investor_city memory usage: 0.04MB
funding_round_type memory usage: 0.3MB
funded_at memory usage: 0.32MB
funded_month memory usage: 0.31MB
funded_quarter memory usage: 0.31MB
funded_year memory usage: 0.04MB
raised_amount_usd memory usage: 0.04MB
Chunk 9 memory usage: 4.64MB

Chunk 10:
company_permalink memory usage: 0.21MB
company_name memory usage: 0.18MB
company_category_code memory usage: 0.18MB
company_country_code memory usage: 0.16MB
company_state_code memory usage: 0.16MB
company_region memory usage: 0.18MB
company_city memory usage: 0.18MB
investor_permalink memory usage: 0.21MB
investor_name memory usage: 0.19MB
investor_category_code memory usage: 0.02MB
investor_country_code memory usage: 0.02MB
investor_state_code memory usage: 0.02MB
investor_region memory usage: 0.18MB
investor_city memory usage: 0.02MB
funding_round_type memory usage: 0.17MB
funded_at memory usage: 0.18MB
funded_month memory usage: 0.18MB
funded_quarter memory usage: 0.18MB
funded_year memory usage: 0.02MB
raised_amount_usd memory usage: 0.02MB
Chunk 10 memory usage: 2.67MB

Total Memory usage: 57.02MB

We see that all in all, we have about 57MB of data. To get an idea of how we might reduce this, let's analyze the datatypes of each column in each chunk.

In [ ]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
types = set()
for i, chunk in enumerate(chunk_iter):
    if i == 0:
        print(chunk.columns.tolist())   # print the column names once, before the per-chunk dtype lists
    print(chunk.dtypes.values.tolist())
    types = types.union(set(chunk.dtypes.values))
types
['company_permalink', 'company_name', 'company_category_code', 'company_country_code', 'company_state_code', 'company_region', 'company_city', 'investor_permalink', 'investor_name', 'investor_category_code', 'investor_country_code', 'investor_state_code', 'investor_region', 'investor_city', 'funding_round_type', 'funded_at', 'funded_month', 'funded_quarter', 'funded_year', 'raised_amount_usd']
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('O'), dtype('float64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('int64'), dtype('float64')]
Out[ ]:
{dtype('int64'), dtype('float64'), dtype('O')}

We have only integers, floats, and strings (object columns). We suspect that some memory can be saved by downcasting the numeric columns and by converting float columns that really hold integers to an integer type. We also drop the mostly empty investor_category_code column. We do this in the following code cell.

In [ ]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
types = set()
mem_usage = 0
for chunk in chunk_iter:
    chunk.drop(columns=['investor_category_code'], inplace=True)

    # float columns whose values have no fractional part really hold integers,
    # so convert them to a (nullable) integer type before downcasting
    floats = chunk.loc[:, chunk.dtypes == 'float64']
    float_cols = floats.columns
    for col in float_cols:
        remainder = chunk[col].apply(lambda x: x % 1).sum()
        if remainder == 0:
            chunk[col] = chunk[col].astype('Int64')
            chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
        else:
            chunk[col] = pd.to_numeric(chunk[col], downcast='float')

    # downcast the remaining integer columns to the smallest subtype that fits
    ints = chunk.loc[:, chunk.dtypes == 'int64']
    int_cols = ints.columns
    for col in int_cols:
        chunk[col] = pd.to_numeric(chunk[col], downcast='integer')

    mem_usage += chunk.memory_usage(deep=True).sum()/(1024**2)

print(f"Total Memory Usage: {mem_usage}MB")
Total Memory Usage: 55.84935283660889MB

This only reduced the memory consumption by about 1MB, so it didn't have a big effect. Let's look at the object columns to see if we can make some more impactful changes.

In [ ]:
# peek at the object columns of the last chunk
objs = chunk.loc[:, chunk.dtypes == 'O']
objs.head(5)
Out[ ]:
company_permalink company_name company_category_code company_country_code company_state_code company_region company_city investor_permalink investor_name investor_region funding_round_type funded_at funded_month funded_quarter
50000 /company/nuorder NuORDER fashion USA CA Los Angeles West Hollywood /person/mortimer-singer Mortimer Singer unknown series-a 2012-10-01 2012-10 2012-Q4
50001 /company/chacha ChaCha advertising USA IN Indianapolis Carmel /person/morton-meyerson Morton Meyerson unknown series-b 2007-10-01 2007-10 2007-Q4
50002 /company/binfire Binfire software USA FL Bocat Raton Bocat Raton /person/moshe-ariel Moshe Ariel unknown angel 2008-04-18 2008-04 2008-Q2
50003 /company/binfire Binfire software USA FL Bocat Raton Bocat Raton /person/moshe-ariel Moshe Ariel unknown angel 2010-01-01 2010-01 2010-Q1
50004 /company/unified-color Unified Color software USA CA SF Bay South San Frnacisco /person/mr-andrew-oung Mr. Andrew Oung unknown angel 2010-01-01 2010-01 2010-Q1

Since the investor_category_code column is mostly missing values, we'll drop it. We'll also drop the two columns whose names end in permalink, since they contain URL information that won't be useful for our analysis. Finally, we only need to keep track of the funded year and month, so the funded_quarter and funded_at columns can be dropped as well.

We can split the funded_month column into two columns, funded_year and funded_month, and convert each of these to a small integer type, which will save space.
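To get a feel for why this split helps, here is a small illustrative comparison on synthetic data (not the crunchbase file itself): a column of 'YYYY-MM' strings versus the equivalent Int16 year and Int8 month columns. The sketch parses with pd.to_numeric before casting, which is just one safe way to do the conversion.

In [ ]:
sample = pd.Series(['2012-10'] * 10000)                  # 10,000 'YYYY-MM' strings
as_strings = sample.memory_usage(deep=True)

year = pd.to_numeric(sample.str[0:4]).astype('Int16')    # e.g. 2012
month = pd.to_numeric(sample.str[-2:]).astype('Int8')    # e.g. 10
as_numbers = year.memory_usage(deep=True) + month.memory_usage(deep=True)

print(f"as strings: {as_strings/1024:.0f}KB, as year + month: {as_numbers/1024:.0f}KB")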

In [ ]:
drop_cols = ['investor_category_code','company_permalink','investor_permalink','funded_at','funded_quarter']
alter_cols = ['funded_month']
In [ ]:
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

mem_usage = 0
for chunk in chunk_iter:
    chunk.drop(columns=drop_cols, inplace=True)

    # split the 'YYYY-MM' strings into a small integer year and month
    chunk['funded_year'] = chunk['funded_month'].str[0:4].astype('Int16')
    chunk['funded_month'] = chunk['funded_month'].str[-2:].astype('Int8')

    # convert float columns that really hold integers, then downcast everything numeric
    floats = chunk.loc[:, chunk.dtypes == 'float64']
    float_cols = floats.columns
    for col in float_cols:
        remainder = chunk[col].apply(lambda x: x % 1).sum()
        if remainder == 0:
            chunk[col] = chunk[col].astype('Int64')
            chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
        else:
            chunk[col] = pd.to_numeric(chunk[col], downcast='float')

    ints = chunk.loc[:, chunk.dtypes == 'int64']
    int_cols = ints.columns
    for col in int_cols:
        chunk[col] = pd.to_numeric(chunk[col], downcast='integer')

    mem_usage += chunk.memory_usage(deep=True).sum()/(1024**2)

print(f"Total memory usage: {round(mem_usage,2)}MB")
Total memory usage: 37.54MB

After making these changes, we were able to reduce the memory footprint of the data from about 57MB to about 37.5MB, a reduction of roughly a third.

Saving the DataFrame into a database using SQLite3¶

Now, we'll use the DataFrame.to_sql() method to append our processed data, chunk by chunk, to a table for future querying.

In [ ]:
import sqlite3
conn = sqlite3.connect('crunchbase.db')

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
for chunk in chunk_iter:
    # same cleaning as above
    chunk.drop(columns=drop_cols, inplace=True)
    chunk['funded_year'] = chunk['funded_month'].str[0:4].astype('Int16')
    chunk['funded_month'] = chunk['funded_month'].str[-2:].astype('Int8')

    floats = chunk.loc[:, chunk.dtypes == 'float64']
    float_cols = floats.columns
    for col in float_cols:
        remainder = chunk[col].apply(lambda x: x % 1).sum()
        if remainder == 0:
            chunk[col] = chunk[col].astype('Int64')
            chunk[col] = pd.to_numeric(chunk[col], downcast='integer')
        else:
            chunk[col] = pd.to_numeric(chunk[col], downcast='float')

    ints = chunk.loc[:, chunk.dtypes == 'int64']
    int_cols = ints.columns
    for col in int_cols:
        chunk[col] = pd.to_numeric(chunk[col], downcast='integer')

    # append the cleaned chunk to the 'investments' table
    chunk.to_sql("investments", conn, if_exists='append', index=False)

Let's get some info on the table we just created.

In [ ]:
query = """
PRAGMA table_info(investments);
"""
result = pd.read_sql(query,conn)
result
Out[ ]:
cid name type notnull dflt_value pk
0 0 company_name TEXT 0 None 0
1 1 company_category_code TEXT 0 None 0
2 2 company_country_code TEXT 0 None 0
3 3 company_state_code TEXT 0 None 0
4 4 company_region TEXT 0 None 0
5 5 company_city TEXT 0 None 0
6 6 investor_name TEXT 0 None 0
7 7 investor_country_code TEXT 0 None 0
8 8 investor_state_code TEXT 0 None 0
9 9 investor_region TEXT 0 None 0
10 10 investor_city TEXT 0 None 0
11 11 funding_round_type TEXT 0 None 0
12 12 funded_month INTEGER 0 None 0
13 13 funded_year INTEGER 0 None 0
14 14 raised_amount_usd INTEGER 0 None 0

Now we have a SQLite table we can query to answer questions about our dataset. Let's demonstrate this explicitly by working through a few questions.
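One note before we begin: every query below aggregates heavily, so its result is tiny. If a query ever did return more rows than the 10MB budget allows, pd.read_sql also accepts a chunksize argument and streams the result in batches, for example:

In [ ]:
# a sketch only; none of the queries below actually need this
for batch in pd.read_sql('SELECT * FROM investments;', conn, chunksize=5000):
    pass  # process each 5,000-row batch here, then let it go out of scope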

Answering Questions with Memory Constraints¶

Q1: What percentage of total investment funds do the top 10% of companies account for? What about the top 1%?¶

In [ ]:
q = 'SELECT company_name, SUM(raised_amount_usd) AS total FROM investments GROUP BY company_name ORDER BY total DESC;'
result = pd.read_sql(q, conn)
print(f"{result.memory_usage(deep=True).sum()/(1024*1024)} MB \n")

total = result['total'].sum()

# the query sorts companies by total raised (descending), so the top 10% are simply the first rows
num_companies = round((result.shape[0]/100)*10)
percentage = 100*(result[0:num_companies]['total'].sum()/total)
print(f"Top 10% of companies account for {round(percentage,2)}% of total investment funds")

num_companies = round((result.shape[0]/100)*1)
percentage = 100*(result[0:num_companies]['total'].sum()/total)
print(f"Top 1% of companies account for {round(percentage,2)}% of total investment funds")
0.8442239761352539 MB 

Top 10% of companies account for 67.13% of total investment funds
Top 1% of companies account for 26.31% of total investment funds

Q2: Which company category attracted the most investments? How much did this category total in investments?¶

In [ ]:
q = 'SELECT company_category_code, SUM(raised_amount_usd) AS total FROM investments GROUP BY company_category_code ORDER BY total DESC LIMIT 1;'
result = pd.read_sql(q, conn)
print(f"{result.memory_usage(deep = True).sum()/(1024*1024)} MB \n")

print('Company category attracting the most investments: ' + result["company_category_code"].values[0])

total = result["total"].values[0]
print(f"Total biotech investments: ${total}")
0.00019073486328125 MB 

Company category attracting the most investments: biotech
Total biotech investments: $220792846124

Q3: Which investor contributed the most? Which investor contributed the most per startup (among those investing in more than 5 startups)?¶

In [ ]:
q = 'SELECT investor_name, COUNT(investor_name) AS num_startups, SUM(raised_amount_usd) AS total FROM investments GROUP BY investor_name HAVING num_startups > 5 ORDER BY total DESC;'
result = pd.read_sql(q, conn)
print(f"{result.memory_usage(deep = True).sum()/(1024*1024)} MB \n")

print('Biggest investor: ' + result["investor_name"].values[0])
amount = result["total"].values[0]
print(f"Total amount contributed: ${amount}\n")

result["investment per startup"] = result["total"]/result["num_startups"]

most_per = result.loc[result["investment per startup"].max() == result["investment per startup"],:]
print('Biggest investor per startup: ' + most_per["investor_name"].values[0])
amount = most_per["total"].values[0]
print(f"Amount contributed per startup: ${amount}\n")
print("(among investors contributing to at least 5 startups)")
0.238922119140625 MB 

Biggest investor: Kleiner Perkins Caufield & Byers
Total amount contributed: $22435652752.0

Biggest investor per startup: Sprint Nextel
Amount contributed per startup: $5000000000.0

(among investors contributing to more than 5 startups)

Q4: Which type of funding round occurred most frequently? Which funding round type resulted in the highest investment total? Which funding type attracted the most investment per round of funding?¶

In [ ]:
q = 'SELECT funding_round_type, SUM(raised_amount_usd) AS total, COUNT(*) AS num_rounds FROM investments GROUP BY funding_round_type ORDER BY total DESC;'
result = pd.read_sql(q, conn)
print(f"{result.memory_usage(deep=True).sum()/(1024*1024)} MB \n")
result["funds per round"] = result["total"]/result["num_rounds"]

most_freq = result.loc[result["num_rounds"].max() == result["num_rounds"],:]["funding_round_type"]
print(f"Most frequent type of funding round: {most_freq.values[0]}")

most_total = result.loc[0,"funding_round_type"]
total = result.loc[0,"total"]
print(f"funding round contributing highest total: {most_total}. Total of ${round(total)}")

funds_per = result.loc[result["funds per round"].max() == result["funds per round"],:]["funding_round_type"]
print(f"Highest funds per round: {funds_per.values[0]}")
0.0008592605590820312 MB 

Most frequent type of funding round: series-a
funding round contributing highest total: series-c+. Total of $531506928414
Highest funds per round: post-ipo
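Having answered our questions, we can close the database connection as a final housekeeping step (not shown in the runs above):

In [ ]:
conn.close()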

Conclusion¶

In this notebook, we began by processing the dataset in chunks using pandas. In particular, we converted some inefficient string columns into more efficient numeric equivalents and downcast each numeric column to the most space-efficient subtype. After making these changes, we decreased the total memory consumption from about 57MB to about 37.5MB. We then saved the processed data into a SQLite database using sqlite3. Finally, to demonstrate the efficacy of this workflow for analyzing big datasets, we answered a variety of questions about the data by querying the database appropriately, converting each result into a pandas dataframe, and checking its memory consumption. In each case, we were able to extract the desired answer while remaining well under the 10MB memory constraint.