%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# draw every figure at 300 dots per inch
mpl.rcParams.update({"figure.dpi": 300})

# number of rows in euk for each kingdom
euk.groupby("Kingdom").size()

# missing_groups.py

# the only genomes bigger than 2Gb are plants and animals
euk[euk["Size (Mb)"] > 2000].groupby("Kingdom").size()

# make a categorical version of the kingdom column
euk["Kingdom category"] = euk["Kingdom"].astype("category")

# repeat the filter and count. observed=False is spelled out so that
# kingdoms with no genomes over 2Gb are still listed with a count of 0,
# whichever default the installed pandas uses for categorical groupers.
(
    euk[euk["Size (Mb)"] > 2000]
    .groupby("Kingdom category", observed=False)
    .size()
)

# filter_berlin.py

from pandas.api.types import CategoricalDtype

# read in weather data and convert months to categories
weather = pd.read_csv("all_weather.csv")

months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# an ordered categorical type, so months compare and sort in calendar order
month_dtype = CategoricalDtype(categories=months, ordered=True)
weather["Month"] = weather["Month"].astype(month_dtype)

# get just weather in Berlin during the 1960s
# (between() includes both end points, so 1960 and 1969 are kept)
in_berlin = weather["City"] == "Berlin"
in_sixties = weather["Year"].between(1960, 1969)
berlin = weather[in_berlin & in_sixties]

# filter_warm_days

# keep the Berlin rows whose temperature is above 24 °C
is_warm = berlin["Temperature (°C)"] > 24
warm_days = berlin[is_warm]
warm_days.head()

# count_warm_days.py

# number of warm days recorded in each year
year_count = warm_days.groupby("Year").size()
year_count

# how many of the listed years have fewer than four warm days
fewer_than_four = year_count < 4
len(year_count[fewer_than_four])

# plot_warm_days.py

# one bar per year, its height being the number of rows of warm_days
# that fall in that year
g = sns.catplot(data=warm_days, x="Year", kind="count", color="red", aspect=2)

# y is given in figure coordinates, so 1.1 puts the title just above the
# top edge of the figure
g.fig.suptitle(
    "Number of warm days in Berlin for each year between 1960 and 1969", y=1.1
)

# plot_all_warm_days.py

# warm days in Berlin with no restriction on the year
all_years_warm_days = weather[
    (weather["City"] == "Berlin") & (weather["Temperature (°C)"] > 24)
]

# a wider aspect ratio leaves room for one bar per year
g = sns.catplot(
    data=all_years_warm_days, x="Year", kind="count", color="red", aspect=3
)

# tilt the year labels so they are less likely to overlap
plt.xticks(rotation=45)

# y is given in figure coordinates, so 1.1 puts the title just above the
# top edge of the figure
g.fig.suptitle(
    "Number of warm days in Berlin for each year between 1960 and 2018", y=1.1
)

# warm-day counts per year, relabelled against every year from 1960 to 1969
sixties = range(1960, 1970)
warm_per_year = warm_days.groupby("Year").size()

# years that are absent from the counts come back as missing values
warm_per_year.reindex(sixties)

# replace those missing values with zero
warm_per_year.reindex(sixties).fillna(0)

# and convert the result to pandas' nullable integer type
warm_per_year.reindex(sixties).fillna(0).astype("Int64")

# year_to_category.py

# berlin was built by filtering weather, so assigning a column to it in
# place makes pandas emit a SettingWithCopyWarning. assign() returns a new
# DataFrame carrying the converted column, which is then bound to berlin.
berlin = berlin.assign(Year=berlin["Year"].astype("category"))
berlin["Year"]

# filter_with_category.py

# redo the warm-day filter now that berlin["Year"] is categorical
warm_days = berlin[berlin["Temperature (°C)"] > 24]

# observed=False is spelled out so that years with no warm days are still
# listed with a count of 0, whichever default the installed pandas uses
# for categorical groupers
warm_days.groupby("Year", observed=False).size()

# plot_with_category.py

# same count plot as before, drawn from the warm_days built after Year was
# converted to a categorical column
g = sns.catplot(data=warm_days, x="Year", kind="count", color="red", aspect=2)

# berlin is restricted to the years 1960-1969, so the title ends at 1969.
# y is given in figure coordinates: 1.1 sits just above the figure's top edge.
g.fig.suptitle(
    "Number of warm days in Berlin for each year between 1960 and 1969", y=1.1
)

# summing a series with no missing values
pd.Series([1, 2, 3, 4]).sum()

# make_missing.py

# a series of four missing values
missing = pd.Series([pd.NA] * 4)
missing

# summary statistics of a series that holds nothing but missing values
missing.mean(), missing.max(), missing.min(), missing.std()

# min_count has to be passed by keyword: the first positional parameter of
# Series.sum is axis, so sum(min) handed the builtin min to axis and raised.
# min_count=1 asks for at least one non-missing value; without one the
# result is missing instead of 0.
missing.sum(min_count=1)

# select_genomes.py

# rows of euk whose class is Mammals
all_mammals = euk[euk["Class"] == "Mammals"]

# keep only the species that have at least three rows
mammals = all_mammals.groupby("Species").filter(
    lambda group: len(group) >= 3
)
mammals

# sum_predicted_genes.py

# total of the "Number of genes" column for each species
mammals.groupby("Species")["Number of genes"].sum()

# the individual rows behind one of those totals
is_target_species = mammals["Species"] == "Rhinolophus ferrumequinum"
mammals.loc[is_target_species, ["Species", "Size (Mb)", "Number of genes"]]

# species whose summed gene count is below 30000
low_total = mammals.groupby("Species").filter(
    lambda group: group["Number of genes"].sum() < 30000
)

# names of those species
low_total["Species"].unique()

# their gene totals
low_total.groupby("Species")["Number of genes"].sum()

# count returns the number of non-missing values in a series
low_total.groupby("Species")["Number of genes"].count()

# the same filter, applied after discarding rows with no gene count
with_gene_counts = mammals.dropna(subset=["Number of genes"])

with_gene_counts.groupby("Species").filter(
    lambda group: group["Number of genes"].sum() < 30000
).groupby("Species")["Number of genes"].sum()

# species names again, this time summing with fillna(0) and min_count=1
with_gene_counts.groupby("Species").filter(
    lambda group: group["Number of genes"].fillna(0).sum(min_count=1) < 30000
)["Species"].unique()

# get_amphibians.py

# amphibian rows of euk, restricted to the columns used in the plots
amphibian_columns = ["Species", "Size (Mb)", "GC%", "Number of genes"]
amphibians = euk.loc[euk["Class"] == "Amphibians", amphibian_columns]
amphibians

# scatter plot of GC% against number of genes
g = sns.relplot(x="Number of genes", y="GC%", height=7, data=amphibians)

g.fig.suptitle("GC% vs number of genes for amphibians", y=1.1)

# amphibian_scale.py

# same scatter plot, with the "Size (Mb)" column mapped to marker size
g = sns.relplot(
    x="Number of genes",
    y="GC%",
    size="Size (Mb)",
    height=7,
    data=amphibians,
)

g.fig.suptitle("GC% vs number of genes for amphibians", y=1.1)

# amphibian_hue.py

# same scatter plot, with the "Size (Mb)" column mapped to marker colour
g = sns.relplot(
    x="Number of genes",
    y="GC%",
    hue="Size (Mb)",
    height=7,
    data=amphibians,
)

g.fig.suptitle("GC% vs number of genes for amphibians", y=1.1)

# skip_missing.py

# drop every row that lacks a value in any of the three plotted columns
plotted_columns = ["Number of genes", "GC%", "Size (Mb)"]
non_missing = amphibians.dropna(subset=plotted_columns)

# marker-size scatter plot drawn from the complete rows only
g = sns.relplot(
    x="Number of genes",
    y="GC%",
    size="Size (Mb)",
    height=7,
    data=non_missing,
)

g.fig.suptitle("GC% vs number of genes for amphibians", y=1.1)