# Run the shared setup script inside this notebook's own namespace (`-i`), so
# that whatever it defines is available to the cells below.
# NOTE(review): `pd`, `np`, `sns` and `euk` are used later but never defined in
# this file - presumably they come from standard_header.py; confirm.
%run -i "standard_header.py"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import matplotlib as mpl

# Raise the resolution of every figure drawn after this point.
mpl.rcParams["figure.dpi"] = 300

# read_weather.py

weather = pd.read_csv("weather.csv")

from pandas.api.types import CategoricalDtype

# Month names in calendar order. Declaring the column as an *ordered*
# categorical makes sorting and comparisons chronological instead of
# alphabetical.
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_dtype = CategoricalDtype(categories=months, ordered=True)

weather["Month"] = weather["Month"].astype(month_dtype)

weather

# quick_filter.py

# Highest "Mean temperature" recorded in Berlin for each year, in a single
# expression: filter the rows, group by year, take the per-group maximum.
weather[weather["City"] == "Berlin"].groupby("Year")["Mean temperature"].max()

# slow_filter.py

# The same per-year Berlin maximum built by hand. Every distinct year costs
# two full boolean filters, and the results end up in a plain dict keyed by
# year rather than in a Series.
maxes = {}

for year in weather["Year"].unique():
    # rows belonging to this year only
    this_year = weather[weather["Year"] == year]
    # ...narrowed down to Berlin
    just_berlin = this_year[this_year["City"] == "Berlin"]
    year_max = just_berlin["Mean temperature"].max()
    maxes[year] = year_max

%%timeit
# fast_broadcast.py

# Multiply the whole column by a scalar in one vectorised operation
# (presumably converting megabases to bases, going by the column name).
sizes = euk["Size (Mb)"] * 1_000_000

%%timeit
# slow_loop.py

# The same multiplication, one element at a time in a Python loop; the result
# is a list rather than a Series.
sizes = []
for size in euk["Size (Mb)"]:
    sizes.append(size * 1_000_000)

%%timeit
# fast_np.py

# np.log applied to the entire column in a single call.
np.log(euk["Size (Mb)"])

%%timeit
# slow_log.py

# The same logarithm, calling np.log once per value.
logs = []
for size in euk["Size (Mb)"]:
    logs.append(np.log(size))

# Print whatever iterating over `euk` directly yields.
# NOTE(review): for a pandas DataFrame these are the column labels, not the
# rows - `euk` is defined outside this file; confirm its type.
for x in euk:
    print(x)

# iterrows() yields (index, row) pairs; this loop does nothing with them.
for index, row in euk.iterrows():
    pass

%%timeit
# slow_row_iteration.py

# Genes per megabase computed row by row via iterrows(), collected in a list.
densities = []
for index, row in euk.iterrows():

    # calculate number of genes per megabase
    gene_density = row["Number of genes"] / row["Size (Mb)"]

    densities.append(gene_density)

%%timeit

# The same ratio as a single column-by-column division.
euk["Number of genes"] / euk["Size (Mb)"]

import calendar

# calendar.isleap works on one year at a time; it has no column-wise form,
# so the cells below compare three ways of calling it for every row.
calendar.isleap(1992)

%%timeit
# leap_with_loop.py

# Loop over the values of the single column.
leaps = []
for year in euk["Publication year"]:
    leaps.append(calendar.isleap(year))

%%timeit
# leap_with_rows.py

# Loop over whole rows with iterrows() and pick the column out of each row.
leaps = []
for index, row in euk.iterrows():
    leaps.append(calendar.isleap(row["Publication year"]))

%%timeit
# leap_with_apply.py

# Let pandas call the function for each value of the column.
euk["Publication year"].apply(calendar.isleap)

# check_prime.py


def is_prime_number(x):
    """Return True if ``int(x)`` is a prime number, otherwise False.

    The argument is converted with ``int()`` first, so floats such as ``7.0``
    and numeric strings are accepted. Anything below 2 is not prime.
    Primality is checked by plain trial division against every integer from
    2 up to ``x - 1``.
    """
    x = int(x)
    if x < 2:
        return False
    # A non-zero remainder for every candidate divisor means no divisor exists.
    return all(x % y for y in range(2, x))

# drop_missing.py

# Keep only the rows where both counts are present: is_prime_number() converts
# its argument with int(), which raises on a missing (NaN) value.
both_present = euk.dropna(subset=["Number of genes", "Number of proteins"])

%%timeit
# prime_rows.py

result = []
for index, row in both_present.iterrows():
    result.append(
        is_prime_number(row["Number of genes"])
        and is_prime_number(row["Number of genes"])
    )

%%timeit
# prime_apply.py

# Row-wise apply (axis=1): the lambda receives one row at a time and tests
# both columns of that row.
both_present.apply(
    lambda row: is_prime_number(row["Number of genes"])
    and is_prime_number(row["Number of proteins"]),
    axis=1,
)

%%timeit
# prime_lambda.py
# Column-wise alternative: apply the test to each column separately, then
# combine the two boolean Series element by element with `&`.
(
    both_present["Number of genes"].apply(is_prime_number) 
    & both_present["Number of proteins"].apply(is_prime_number)
)

# count_papers.py

from urllib.request import urlopen
import re
import time


def count_papers(classname):
    """Return the number of PubMed records matching *classname*.

    Sends one request to the NCBI E-utilities ``esearch`` endpoint with
    ``rettype=count`` and extracts the number inside the ``<Count>`` element
    of the reply.

    Args:
        classname: Search term as a string, e.g. ``"Ascomycetes"``.

    Returns:
        The reported hit count as an ``int``.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        ValueError: If the reply contains no ``<Count>`` element.
    """
    # Function-scope import so this definition does not rely on an import
    # living in some other notebook cell.
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#' or non-ASCII letters that would otherwise
    # corrupt the query string.
    link = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        f"?db=pubmed&term={quote_plus(classname)}&rettype=count"
    )

    # The context manager closes the connection even if reading fails.
    with urlopen(link) as response:
        body = response.read().decode("utf-8", errors="replace")

    match = re.search(r"<Count>(\d+)</Count>", body)
    if match is None:
        raise ValueError(
            f"no <Count> element in PubMed response for {classname!r}"
        )

    # Pause after every successful request - presumably to stay under the
    # service's request-rate limit.
    time.sleep(1)
    return int(match.group(1))

# One example query.
count_papers("Ascomycetes")

%%timeit
# count_papers_loop.py

# Look up the class of each row returned by euk.head() with an explicit loop.
# Every call performs a network request and then sleeps for one second, and
# %%timeit executes the cell repeatedly, so this cell is slow by design.
counts = []
for classname in euk.head()["Class"]:
    counts.append(count_papers(classname))

%%timeit
# count_papers_apply.py

# The same lookups through Series.apply.
euk.head()["Class"].apply(count_papers)

# Extrapolate a cost of 7 units per 5 rows to the whole table.
# NOTE(review): presumably ~7 s for the 5-row runs above - the measured
# timings are not recorded in this file; confirm.
(7 / 5) * len(euk)

# Number of distinct classes, i.e. how many queries are needed if every class
# is looked up only once.
euk["Class"].nunique()

# build_dict.py

# Query each distinct class exactly once and keep the answers in a dict.
# Filling the dict inside the loop means already-fetched counts survive if a
# later request raises.
count_dict = {}
for classname in euk["Class"].unique():
    count_dict[classname] = count_papers(classname)

print(count_dict)

%%timeit

# Translate every class name to its count through the lookup table.
euk["Class"].replace(count_dict)

%%timeit

# The same translation, calling dict.get once per element.
euk["Class"].apply(count_dict.get)

# cache_decorator.py

from functools import lru_cache


@lru_cache(maxsize=None)
def count_papers(classname):
    """Return the number of PubMed records matching *classname*, with caching.

    Sends one request to the NCBI E-utilities ``esearch`` endpoint with
    ``rettype=count`` and extracts the number inside the ``<Count>`` element
    of the reply. Results are memoised without a size limit, so a repeated
    term is answered from the cache with no request and no pause.

    Args:
        classname: Search term as a string (must be hashable for the cache).

    Returns:
        The reported hit count as an ``int``.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        ValueError: If the reply contains no ``<Count>`` element.
    """
    # Function-scope import so this definition does not rely on an import
    # living in some other notebook cell.
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#' or non-ASCII letters that would otherwise
    # corrupt the query string.
    link = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        f"?db=pubmed&term={quote_plus(classname)}&rettype=count"
    )

    # The context manager closes the connection even if reading fails.
    with urlopen(link) as response:
        body = response.read().decode("utf-8", errors="replace")

    match = re.search(r"<Count>(\d+)</Count>", body)
    if match is None:
        # Exceptions are not cached, so a failed term is retried next time.
        raise ValueError(
            f"no <Count> element in PubMed response for {classname!r}"
        )

    # Pause after every real request - presumably to stay under the service's
    # request-rate limit. Cache hits never reach this line.
    time.sleep(1)
    return int(match.group(1))


# With the cached version, only the first occurrence of each distinct class
# triggers a request; repeats are served from the cache.
euk["Class"].apply(count_papers)

# The same function applied to species names instead, for the rows returned
# by head().
euk["Species"].head().apply(count_papers)

# random_samply.py

# ...and for ten randomly chosen rows.
euk["Species"].sample(10).apply(count_papers)

# read_weather.py

# Load the second weather file; the plotting cells below read its
# "City", "Temperature (°C)" and "Rainfall (mm)" columns.
all_weather = pd.read_csv("all_weather.csv")
all_weather

# kde_grid_weather.py

# Leave out every row with 100 mm of rain or more before plotting.
under_100mm = all_weather[all_weather["Rainfall (mm)"] < 100]

# Lay out the grid: one square panel per city.
g = sns.FacetGrid(data=under_100mm, col="City", height=4, aspect=1)

# Draw a filled two-dimensional density of temperature against rainfall in
# each panel.
# NOTE(review): `shade` / `shade_lowest` are deprecated in newer seaborn
# releases in favour of `fill` / `thresh`; confirm the seaborn version this
# notebook targets before changing them.
kde_options = dict(shade=True, shade_lowest=False, cmap="OrRd")
g.map(sns.kdeplot, "Temperature (°C)", "Rainfall (mm)", **kde_options)

# y > 1 places the title above the panels.
g.fig.suptitle(
    "Density plot of temperature vs. rainfall for three cities", y=1.05
)

# kde_grid_head.py

# Leave out every row with 100 mm of rain or more, then keep only the FIRST
# 1000 remaining rows, in file order.
under_100mm = all_weather[all_weather["Rainfall (mm)"] < 100]
first_rows = under_100mm.head(1000)

# Lay out the grid: one square panel per city present in those rows.
g = sns.FacetGrid(data=first_rows, col="City", height=4, aspect=1)

# Draw a filled two-dimensional density of temperature against rainfall in
# each panel.
# NOTE(review): `shade` / `shade_lowest` are deprecated in newer seaborn
# releases in favour of `fill` / `thresh`; confirm the seaborn version this
# notebook targets before changing them.
kde_options = dict(shade=True, shade_lowest=False, cmap="OrRd")
g.map(sns.kdeplot, "Temperature (°C)", "Rainfall (mm)", **kde_options)

# y > 1 places the title above the panels.
g.fig.suptitle(
    "Density plot of temperature vs. rainfall for three cities", y=1.05
)

# kde_grid_sample.py

# Leave out every row with 100 mm of rain or more, then draw 1000 of the
# remaining rows at random (a different subset on every run, since no seed
# is given).
under_100mm = all_weather[all_weather["Rainfall (mm)"] < 100]
random_rows = under_100mm.sample(1000)

# Lay out the grid: one square panel per city present in the sample.
g = sns.FacetGrid(data=random_rows, col="City", height=4, aspect=1)

# Draw a filled two-dimensional density of temperature against rainfall in
# each panel.
# NOTE(review): `shade` / `shade_lowest` are deprecated in newer seaborn
# releases in favour of `fill` / `thresh`; confirm the seaborn version this
# notebook targets before changing them.
kde_options = dict(shade=True, shade_lowest=False, cmap="OrRd")
g.map(sns.kdeplot, "Temperature (°C)", "Rainfall (mm)", **kde_options)

# y > 1 places the title above the panels.
g.fig.suptitle(
    "Density plot of temperature vs. rainfall for three cities", y=1.05
)

%%timeit

# Baseline: boolean filter on the "City" column as loaded.
weather[weather["City"] == "Berlin"]

# Add a copy of the city names stored with pandas' category dtype.
weather["City category"] = weather["City"].astype("category")

%%timeit

# The same filter, this time against the categorical column.
weather[weather["City category"] == "Berlin"]

# city_index_weather.py

# Make "City" the index and sort by it, so rows can be selected by label.
# This rebinds `weather`: "City" is no longer a regular column afterwards.
weather = weather.set_index("City").sort_index()
weather

%%timeit

# Label lookup on the sorted index instead of a boolean mask.
weather.loc["Berlin"]

# Start again from the raw file. This discards the City index and the extra
# "City category" column created above, and "Month" is back to whatever
# read_csv parsed rather than the ordered categorical set up at the top.
weather = pd.read_csv("weather.csv")

%%timeit
# weather_complex_filter.py

# Select a single city/date by AND-ing four boolean masks together; each
# comparison is parenthesised because `&` binds tighter than `==`.
weather[
    (weather["City"] == "Berlin")
    & (weather["Year"] == 1989)
    & (weather["Month"] == "November")
    & (weather["Day of month"] == 9)
]

# weather_complex_index.py

# Move the four filter columns into a four-level index. `weather` is rebound,
# so those names are index levels, not columns, from here on.
weather = weather.set_index(["City", "Year", "Month", "Day of month"])

%%timeit
# weather_filter_index.py

# The same selection as a label lookup: one key per index level, in the order
# the levels were declared.
weather.loc["Berlin", 1989, "November", 9]

# small_large_weather.py

# Both frames are read from the same file with the same four-level index;
# they differ only in how many rows are loaded (6,000 vs 60,000).
index_levels = ["City", "Year", "Month", "Day of month"]

small_weather = pd.read_csv("weather.csv", index_col=index_levels, nrows=6000)
large_weather = pd.read_csv("weather.csv", index_col=index_levels, nrows=60000)

%%timeit

# Index lookup of one city/date in the 6,000-row frame.
small_weather.loc["Berlin", 1965, "November", 9]

%%timeit

# The identical lookup in the 60,000-row frame, to compare how the lookup
# time changes with ten times as many rows.
large_weather.loc["Berlin", 1965, "November", 9]