%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# draw figures at 300 dots per inch
mpl.rcParams.update({"figure.dpi": 300})

# number of distinct kingdoms and of distinct classes, as a pair
tuple(euk[column].nunique() for column in ("Kingdom", "Class"))

# number of distinct species
euk["Species"].nunique()

# how many species occur once, twice, three times, ... in the table
(
    euk["Species"]
    .value_counts()
    .value_counts()
    .head()
)

# median GC percentage per kingdom, computed by hand with one boolean mask
# per kingdom (kingdoms are visited in order of first appearance)
for kingdom in euk["Kingdom"].unique():
    in_kingdom = euk["Kingdom"] == kingdom
    print(kingdom, euk.loc[in_kingdom, "GC%"].median())

# the grouped object on its own, before any aggregation is applied
euk.groupby("Kingdom")

# count_kingdom.py

# number of rows belonging to each kingdom
euk.groupby("Kingdom").size()

# the same counts via value_counts, ordered from most to least frequent
euk["Kingdom"].value_counts()

# number of rows for the last few publication years
(
    euk
    .groupby("Publication year")
    .size()
    .tail()
)

# complex_count.py

# number of rows for every combination of kingdom and assembly status
(
    euk
    .groupby(["Kingdom", "Assembly status"])
    .size()
)

# selecting one column from the grouped table gives a grouped series
(
    euk
    .groupby("Kingdom")
    ["GC%"]
)

# mean_kingdom_gc.py

# mean GC percentage of the genomes in each kingdom
(
    euk
    .groupby("Kingdom")
    ["GC%"]
    .mean()
)

# median genome size (Mb) for each assembly status
(
    euk
    .groupby("Assembly status")
    ["Size (Mb)"]
    .median()
)

# max.py

# highest GC percentage seen in each kingdom / assembly status combination
(
    euk
    .groupby(["Kingdom", "Assembly status"])
    ["GC%"]
    .max()
)

# multiple_columns.py

# mean gene count and mean protein count for each kingdom; a list of
# column names (double brackets) selects several columns at once
(
    euk
    .groupby("Kingdom")
    [["Number of genes", "Number of proteins"]]
    .mean()
)

# class_plot.py

# one row per class holding the median gene and protein counts of its genomes
class_medians = (
    euk
    .groupby("Class")
    [["Number of genes", "Number of proteins"]]
    .median()
)

# scatter plot with one point per class
g = sns.relplot(
    data=class_medians,
    x="Number of genes",
    y="Number of proteins",
)

# y is given in figure coordinates, so 1.05 puts the title just above the top edge
g.fig.suptitle(
    "Median numbers of genes and proteins for genomes in each class",
    y=1.05,
)

# multiple_stats.py

# kept so that np stays available to the rest of the notebook
import numpy as np

# mean and median genome size for each kingdom, one column per statistic.
# The aggregations are named as strings: pandas then uses its own group-wise
# mean/median (which skip missing values), and recent pandas versions no
# longer emit a FutureWarning as they do for np.mean / np.median callables.
# The resulting column labels ("mean", "median") are unchanged.
euk.groupby("Kingdom")["Size (Mb)"].agg(["mean", "median"])

# multiple_everything.py

# mean and median of the gene and protein counts for every combination of
# kingdom and assembly status.
# Several columns must be selected with a list (double brackets): selecting
# with a bare tuple of keys was deprecated and raises an error in pandas 2.
# The statistics are named as strings so that pandas' own group-wise
# implementations are used without a deprecation warning.
(
    euk
    .groupby(["Kingdom", "Assembly status"])
    [["Number of genes", "Number of proteins"]]
    .agg(["mean", "median"])
)

# the same table with the group labels moved out of the index into ordinary
# columns
(
    euk
    .groupby(["Kingdom", "Assembly status"])
    [["Number of genes", "Number of proteins"]]
    .agg(["mean", "median"])
    .reset_index()
)

# summary_table.py

# mean and median gene/protein counts per kingdom and assembly status, with
# the group labels as ordinary columns.
# The columns are selected with a list (a bare tuple of keys is rejected by
# pandas 2) and the statistics are named as strings (numpy callables in agg
# are deprecated).
summary = (
    euk
    .groupby(["Kingdom", "Assembly status"])
    [["Number of genes", "Number of proteins"]]
    .agg(["mean", "median"])
    .reset_index()
)

# agg with a list of statistics produces two-level column labels
# (column, statistic); replace them with flat, readable names.  The order
# follows the table: the two group labels, then mean/median for each column.
summary.columns = [
    "Kingdom",
    "Assembly status",
    "Mean number of genes",
    "Median number of genes",
    "Mean number of proteins",
    "Median number of proteins",
]

summary.head()

# mean of each column together with the number of rows in each class.
# "mean" is passed as a string because numpy callables in agg are deprecated;
# len is kept as a plain callable so the column label stays "len".
(
    euk
    .groupby("Class")
    [["Number of genes", "Number of proteins"]]
    .agg(["mean", len])
).head()

# aggregate_with_dict.py

# a dict chooses a different aggregation for each column: the median of the
# two numeric columns, and the row count (via len) for the species column
(
    euk
    .groupby("Class")
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
).head()

# class_scatter_size.py

# one row per class: median gene count, median protein count and the number
# of rows in the class (stored in the "Species" column).
# The medians are requested by name because passing np.median to agg is
# deprecated in recent pandas versions.
data = euk.groupby("Class")[
    ["Number of genes", "Number of proteins", "Species"]
].agg(
    {
        "Number of genes": "median",
        "Number of proteins": "median",
        "Species": len,
    }
)

# one point per class, with the point size scaled by the row count
g = sns.relplot(
    data=data,
    x="Number of genes",
    y="Number of proteins",
    size="Species",
    sizes=(10, 100),
    color="indigo",
)

g.fig.suptitle(
    "Median numbers of genes and proteins for genomes in each class", y=1.05
)

# per-class summary with the class labels as the index (the default).
# Medians are requested by name throughout this section because passing
# np.median to agg is deprecated in recent pandas versions.
(
    euk
    .groupby("Class")
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
).head()

# as_index=False keeps "Class" as an ordinary column instead
(
    euk
    .groupby("Class", as_index=False)
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
).head()

# with "Class" available as a column it can be used as a plotting variable
data = (
    euk
    .groupby("Class", as_index=False)
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
)

# one point per class: size shows the row count, colour shows the class
g = sns.relplot(
    data=data,
    x="Number of genes",
    y="Number of proteins",
    size="Species",
    sizes=(10, 100),
    hue="Class",
    palette="Set2",
)

# we need a bit more y space to account for the tall legend
g.fig.suptitle(
    "Median numbers of genes and proteins for genomes in each class", y=1.1
)

# grouping by class and kingdom together keeps the kingdom of every class
# as a column in the summary.
# Medians are requested by name because passing np.median to agg is
# deprecated in recent pandas versions.
(
    euk.groupby(["Class", "Kingdom"], as_index=False)[
        ["Number of genes", "Number of proteins", "Species"]
    ].agg(
        {
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        }
    )
).head()

# class_scatter_size_kingdom_color.py

data = (
    euk
    .groupby(["Class", "Kingdom"], as_index=False)
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
)

# one point per (class, kingdom) row: size shows the row count, colour shows
# the kingdom
g = sns.relplot(
    data=data,
    x="Number of genes",
    y="Number of proteins",
    size="Species",
    sizes=(10, 100),
    hue="Kingdom",
    palette="Set2",
)

# title typo fixed ("kingdomf" -> "kingdom")
g.fig.suptitle("Median numbers of genes and proteins for genomes in each kingdom", y=1.05)

# the same per-class summary restricted to rows whose assembly status is
# "Scaffold": the rows are filtered first, then grouped.
# Medians are requested by name because passing np.median to agg is
# deprecated in recent pandas versions.
(
    euk[euk["Assembly status"] == "Scaffold"]
    .groupby(["Class", "Kingdom"], as_index=False)
    [["Number of genes", "Number of proteins", "Species"]]
    .agg({
            "Number of genes": "median",
            "Number of proteins": "median",
            "Species": len,
        })
).head()

# mean number of genes for each class
euk.groupby("Class")["Number of genes"].mean()

# row-level filter: keep the rows with a genome size below 10 Mb
euk[euk["Size (Mb)"] < 10]

def check_df(group_df):
    """Return True when the group has more than 1000 rows.

    Used as the predicate for ``groupby(...).filter``: groups for which this
    returns True are kept.  ``group_df`` can be anything that supports
    ``len``.
    """
    # the comparison already evaluates to a bool, so no if/else is needed
    return len(group_df) > 1000

# keep only the rows that belong to a class for which check_df returns True
euk.groupby("Class").filter(func=check_df)

# filter_group.py


def check_df(group_df):
    """Group predicate: True for groups with more than 1000 rows."""
    row_count = len(group_df)
    return row_count > 1000


# keep the rows of every class that passes check_df
euk.groupby("Class").filter(func=check_df)

# the same filter with the predicate written inline as a lambda
euk.groupby("Class").filter(lambda class_df: len(class_df) > 1000)

# complex_group_filter.py


def species_filter(df):
    """Return whether the largest gene count in ``df`` is more than twice the smallest.

    ``df`` is one group of rows with a "Number of genes" column.
    """
    gene_counts = df["Number of genes"]
    return gene_counts.max() / gene_counts.min() > 2


# rows with a missing value in any column are dropped first, which avoids
# species where the number of genes is missing for every genome
complete_rows = euk.dropna()
complete_rows.groupby("Species").filter(species_filter)

# the same filter with the max/min ratio test written inline as a lambda
(
    euk
    .dropna()
    .groupby("Species")
    .filter(
        lambda species_df: (
            species_df["Number of genes"].max()
            / species_df["Number of genes"].min()
        )
        > 2
    )
)

# number of genomes at contig or scaffold status in each kingdom, as a flat
# table with one row per (kingdom, status) pair
temp = (
    euk[euk["Assembly status"].isin(["Contig", "Scaffold"])]
    .groupby(["Kingdom", "Assembly status"])
    .size()
    .to_frame(name="Genome count")
    .reset_index()
)
temp

# transform returns one value per input row: here the total count of the
# row's kingdom.  The aggregation is named as a string ("sum") because
# passing the builtin sum is deprecated in recent pandas versions.
temp.groupby("Kingdom")["Genome count"].transform("sum")

# store the per-kingdom total next to each row
temp["Total for kingdom"] = (
    temp
    .groupby("Kingdom")
    ["Genome count"]
    .transform("sum")
)
temp

# normalize.py

# make the summary table
temp = (
    euk[euk["Assembly status"].isin(["Contig", "Scaffold"])]
    .groupby(["Kingdom", "Assembly status"])
    .size()
    .to_frame(name="Genome count")
    .reset_index()
)

# calculate the totals
temp["Total for kingdom"] = (
    temp
    .groupby("Kingdom")
    ["Genome count"]
    .transform("sum")
)

# add a new column: each count as a fraction of its kingdom's total
temp["Proportion"] = temp["Genome count"] / temp["Total for kingdom"]
temp

# the same grouped bar chart drawn twice: first with the raw genome counts,
# then with the within-kingdom proportions
for measure, heading in [
    (
        "Genome count",
        "Number of genomes in contig and scaffold\n assembly status for each kingdom",
    ),
    (
        "Proportion",
        "Fraction of genomes in contig and scaffold\n assembly status for each kingdom",
    ),
]:
    sns.catplot(
        data=temp,
        x="Kingdom",
        y=measure,
        kind="bar",
        hue="Assembly status",
    )
    # the title goes on the axes that catplot has just drawn
    plt.title(heading)

def normalize(counts):
    """Divide every value in ``counts`` by the sum of all the values."""
    total = sum(counts)
    return counts / total


# apply normalize within each kingdom: every count becomes a fraction of the
# total for its kingdom
(
    temp
    .groupby("Kingdom")
    ["Genome count"]
    .transform(normalize)
)

# summary_year.py

# total genome size for each year / kingdom pair, using only the rows with a
# publication year from 2001 to 2018 (both ends included)
in_year_range = euk["Publication year"].between(2001, 2018)
temp2 = (
    euk[in_year_range]
    .groupby(["Publication year", "Kingdom"])
    ["Size (Mb)"]
    .sum()
    .to_frame("Total Mb sequenced")
    .reset_index()
)
temp2

# normalize_year.py

# each kingdom's total as a fraction of everything listed for the same year
mb_by_year = temp2.groupby("Publication year")["Total Mb sequenced"]
temp2["Proportion Mb sequenced"] = mb_by_year.transform(
    lambda year_totals: year_totals / sum(year_totals)
)
temp2

# plot_year_raw.py

# horizontal bar charts of the total Mb per kingdom, one small panel per
# publication year, wrapped after every four panels
g = sns.catplot(
    data=temp2,
    y="Kingdom",
    x="Total Mb sequenced",
    col="Publication year",
    col_wrap=4,
    kind="bar",
    height=1.5,
    aspect=2,
)

# y is above 1 so that the figure title sits above the grid of panels
g.fig.suptitle(
    "Total megabases sequenced for each kingdom in each year since 2001", y=1.1
)

# plot_year_normalized.py

# the same grid of panels, showing the per-year proportions instead of the
# raw totals
g = sns.catplot(
    data=temp2,
    y="Kingdom",
    x="Proportion Mb sequenced",
    col="Publication year",
    col_wrap=4,
    kind="bar",
    height=1.5,
    aspect=2,
)
g.fig.suptitle(
    "Proportion of bases sequenced for each kingdom in each year since 2001",
    y=1.1,
)

# plot_year_normalized_line.py

# a single wide point plot: years along the x axis, one coloured series per
# kingdom
sns.catplot(
    data=temp2,
    x="Publication year",
    y="Proportion Mb sequenced",
    hue="Kingdom",
    aspect=3,
    kind="point",
    height=4,
)

# there is only one set of axes here, so plt.title labels the plot directly
plt.title(
    "Proportion of bases sequenced for each kingdom in each year since 2001"
)

# rows for the three kingdoms that are compared below
is_major_kingdom = temp2["Kingdom"].isin(["Plants", "Animals", "Fungi"])
temp2[is_major_kingdom]

# the first "Total Mb sequenced" value listed for each of those kingdoms
(
    temp2[is_major_kingdom]
    .groupby("Kingdom")
    ["Total Mb sequenced"]
    .apply(lambda totals: totals.iloc[0])
)

# every value divided by the first value listed for its kingdom
(
    temp2[is_major_kingdom]
    .groupby("Kingdom")
    ["Total Mb sequenced"]
    .transform(lambda totals: totals / totals.iloc[0])
)

# baseline.py

# store the ratio as a new column; rows of the other kingdoms are not part
# of the result and are therefore left as missing values.
# NOTE(review): the baseline is the first row of each kingdom, which is the
# 2001 value only if every kingdom has a 2001 row and temp2 is ordered by
# year - confirm against the data.
temp2["Increase relative to 2001"] = (
    temp2.loc[temp2["Kingdom"].isin(["Plants", "Animals", "Fungi"])]
    .groupby("Kingdom")
    ["Total Mb sequenced"]
    .transform(lambda totals: totals / totals.iloc[0])
)

# plot_baseline.py

# one line of points per kingdom showing the ratio to its baseline by year
sns.catplot(
    data=temp2.loc[temp2["Kingdom"].isin(["Plants", "Animals", "Fungi"])],
    x="Publication year",
    y="Increase relative to 2001",
    hue="Kingdom",
    kind="point",
    height=4,
    aspect=3,
)

plt.title("Increase in bases sequenced relative to 2001 for each kingdom")

# iterating over a groupby yields (label, rows of that group) pairs
for kingdom, kingdom_df in euk.groupby("Kingdom"):
    print(kingdom, len(kingdom_df))

# write the rows of every kingdom to a CSV file named after the kingdom
for kingdom, kingdom_df in euk.groupby("Kingdom"):
    kingdom_df.to_csv(kingdom + ".csv", index=False)

# when grouping by two columns, each label is a (kingdom, status) pair
for pair, _ in euk.groupby(["Kingdom", "Assembly status"]):
    print(pair)

# iterate_group_labels.py

# one CSV file per kingdom / status pair, named "<kingdom>_<status>.csv"
for (kingdom, status), group_df in euk.groupby(["Kingdom", "Assembly status"]):
    group_df.to_csv("_".join((kingdom, status)) + ".csv", index=False)

# median number of genes for each class
(
    euk
    .groupby("Class")
    ["Number of genes"]
    .median()
)

# the same medians ordered from lowest to highest
(
    euk
    .groupby("Class")
    ["Number of genes"]
    .median()
    .sort_values()
)

# sort_groups.py

# the index of the sorted result: class labels ordered by median gene count
(
    euk
    .groupby("Class")
    ["Number of genes"]
    .median()
    .sort_values()
    .index
)

# vertical box plot per class, with no explicit order given for the classes
sns.catplot(
    data=euk,
    x="Class",
    y="Number of genes",
    kind="box",
    orient="v",
    color="sandybrown",
    height=4,
    aspect=3,
)
# slant the class labels so that they do not overlap
plt.xticks(rotation=45, horizontalalignment="right")
plt.title("Distribution of number of genes for genomes in each class")

# plot_sorted_groups.py

# class labels ordered from lowest to highest median number of genes
class_order = (
    euk
    .groupby("Class")
    ["Number of genes"]
    .median()
    .sort_values()
    .index
)

# the same box plot with the boxes arranged in that order
sns.catplot(
    data=euk,
    x="Class",
    y="Number of genes",
    kind="box",
    orient="v",
    order=class_order,
    color="sandybrown",
    height=4,
    aspect=3,
)
plt.xticks(rotation=45, horizontalalignment="right")
plt.title("Distribution of number of genes for genomes in each class")