%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Render every figure at 300 dots per inch.
mpl.rcParams.update({"figure.dpi": 300})

# Boolean series: True where the genome size exceeds 500 Mb.
euk["Size (Mb)"].gt(500)

# Store that boolean series as a column and show it next to the species.
euk["Is large"] = euk["Size (Mb)"].gt(500)
euk.loc[:, ["Species", "Kingdom", "Is large"]].head()

# size_category.py

# Turn the True/False flag into a "large"/"small" text label.
euk["Is large"] = euk["Size (Mb)"].gt(500)
euk["Size category"] = euk["Is large"].replace(
    {True: "large", False: "small"}
)
euk.loc[:, ["Species", "Kingdom", "Is large", "Size category"]].head()

# Median gene count within each size category.
euk.groupby(by="Size category")["Number of genes"].median()

# size_category_plot.py

# Box plot of gene counts for each kingdom, split by size category.
sns.catplot(
    kind="box",
    data=euk,
    x="Kingdom",
    y="Number of genes",
    hue="Size category",
    orient="v",
)
plt.title(
    "Distribution of number of genes\n"
    "for small and large genomes in different kingdoms"
)

# Group on the boolean series itself rather than on a stored column.
euk.groupby(euk["Size (Mb)"].gt(500))["Number of genes"].median()

# apply_bin.py


def large_or_small(size):
    """Return "large" when *size* is greater than 500, otherwise "small"."""
    return "large" if size > 500 else "small"


# Label every genome size individually with the helper function.
euk["Size (Mb)"].apply(large_or_small)

# The labelled series can serve as a grouping key without being stored.
euk.groupby(euk["Size (Mb)"].apply(large_or_small))["Number of genes"].median()

# Combine the computed key with a column name for a two-level grouping.
euk.groupby(
    by=[euk["Size (Mb)"].apply(large_or_small), "Kingdom"]
)["Number of genes"].median()

# raw_size_hue.py

# Scatter of GC% against protein count, coloured by the raw genome size.
# Only genomes below 5000 Mb are kept, and rows with any missing value
# are dropped before plotting.
sns.relplot(
    data=euk[euk["Size (Mb)"] < 5000].dropna(),
    x="GC%",
    y="Number of proteins",
    hue="Size (Mb)",
    palette="PuRd",
)

plt.title("GC% vs. Number of proteins\n for genomes of different sizes")

# binned_size_hue.py

# Same scatter, but colour and marker style both follow the size category
# column, using two explicitly chosen colours.
sns.relplot(
    data=euk[euk["Size (Mb)"] < 5000].dropna(),
    x="GC%",
    y="Number of proteins",
    hue="Size category",
    style="Size category",
    palette=["DarkBlue", "Orange"],
)
plt.title("GC% vs. Number of proteins\n for genomes of different sizes")

# binned_col_plot.py

# One scatter panel per size category instead of colour-coding the points.
g = sns.relplot(
    data=euk.loc[euk["Size (Mb)"].lt(5000)].dropna(),
    col="Size category",
    x="GC%",
    y="Number of proteins",
    color="darkkhaki",
    aspect=0.7,
)
# Figure-level title, lifted above the panels with y=1.1.
g.fig.suptitle(
    "GC% vs. Number of proteins for genomes of different sizes",
    y=1.1,
)

# binned_col_plot_with_hue.py

# Panels split by size category; colour now encodes the kingdom.
g = sns.relplot(
    data=euk.loc[euk["Size (Mb)"].lt(5000)].dropna(),
    col="Size category",
    x="GC%",
    y="Number of proteins",
    hue="Kingdom",
    palette="Set1",
    aspect=0.7,
)

# Figure-level title, lifted above the panels with y=1.1.
g.fig.suptitle(
    "GC% vs. Number of proteins for genomes of different sizes",
    y=1.1,
)

# binned_regression_plot.py

# Regression of gene count on genome size, with one fitted line per size
# category. Only genomes below 5000 Mb are kept, and rows with any missing
# value are dropped.
sns.lmplot(
    data=euk[euk["Size (Mb)"] < 5000].dropna(),
    x="Size (Mb)",
    y="Number of genes",
    hue="Size category",
)
# The title names the variables that are actually on the x and y axes.
plt.title(
    "Genome size vs. Number of genes\n for genomes of different sizes", y=1.1
)

# binned_regression_cols.py

# Regression of gene count on genome size, drawn in a separate panel for
# each size category. Points are light grey and the fitted line dark red.
g = sns.lmplot(
    data=euk[euk["Size (Mb)"] < 5000].dropna(),
    x="Size (Mb)",
    y="Number of genes",
    col="Size category",
    scatter_kws={"color": "lightgrey"},
    line_kws={"color": "darkred"},
)

# The title names the variables that are actually on the x and y axes.
g.fig.suptitle(
    "Genome size vs. Number of genes for genomes of different sizes", y=1.1
)

# Histogram of the non-missing GC% values using 10 bins.
# NOTE(review): seaborn documents `distplot` as deprecated in favour of
# `histplot`/`displot` from version 0.11 on — confirm the installed version
# before switching.
sns.distplot(euk.loc[:, "GC%"].dropna(), bins=10, kde=False)
plt.title("Distribution of GC% for all genomes")

# The same data again with 100 bins.
sns.distplot(euk.loc[:, "GC%"].dropna(), bins=100, kde=False)
plt.title("Distribution of GC% for all genomes")

# three_bins.py


def categorize(size):
    """Return "large", "medium" or "small" for a genome size.

    Sizes above 500 are "large", sizes above 200 (up to and including 500)
    are "medium", and everything else is "small".
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    for lower_bound, label in ((500, "large"), (200, "medium")):
        if size > lower_bound:
            return label
    return "small"


# Replace the size category column with the three-level labels.
euk["Size category"] = euk["Size (Mb)"].apply(categorize)

# three_bin_plot.py

# Bar plot of protein counts per size category, one panel per kingdom,
# with the categories in an explicit small-to-large order.
g = sns.catplot(
    kind="bar",
    data=euk,
    col="Kingdom",
    x="Size category",
    y="Number of proteins",
    order=["small", "medium", "large"],
    orient="v",
    height=2.5,
)
g.fig.suptitle(
    "Mean number of proteins for genomes of different sizes",
    y=1.1,
)

# Median gene count within each of the three categories.
euk.groupby(by="Size category")["Number of genes"].median()

# Preview of GC% split into 5 bins.
pd.cut(x=euk["GC%"], bins=5).head()

# cut_bins.py

# Store the bin of each genome and count the genomes per bin.
euk["GC category"] = pd.cut(x=euk["GC%"], bins=5)
euk.groupby(by="GC category").size()

# cut_bins_plot.py

# Genome size against protein count, one panel per GC category.
g = sns.relplot(
    data=euk.loc[euk["Size (Mb)"].lt(5000)],
    col="GC category",
    x="Size (Mb)",
    y="Number of proteins",
    color="teal",
    height=2.5,
)

g.fig.suptitle(
    "Genome size vs. number of proteins "
    "for genomes with varying GC contents",
    y=1.1,
)

# Explicit bin edges every 20 percentage points.
my_bins = [0, 20, 40, 60, 80, 100]
euk["GC category"] = pd.cut(x=euk["GC%"], bins=my_bins)
euk.groupby(by="GC category").size()

# custom_bins.py

# The same edges generated with range().
my_bins = range(0, 101, 20)
euk["GC category"] = pd.cut(x=euk["GC%"], bins=my_bins)
euk.groupby(by="GC category").size()

# unqual_bins.py

# Bins of unequal width.
my_bins = [0, 40, 50, 53, 58, 100]
euk["GC category"] = pd.cut(x=euk["GC%"], bins=my_bins)
euk.groupby(by="GC category").size()

# custom_labels.py

# Same unequal bins, each given a descriptive label (one label per bin).
my_bins = [0, 40, 50, 53, 58, 100]
my_labels = ["very low", "low", "medium", "high", "very high"]
euk["GC category"] = pd.cut(x=euk["GC%"], labels=my_labels, bins=my_bins)
euk.groupby(by="GC category").size()

# custom_labels_plot.py

# Swarm plot of gene counts per GC category for genomes published in 2018.
# Rows with missing values are dropped once, and the year filter is then
# applied to that same result instead of calling dropna() a second time.
sns.catplot(
    data=euk.dropna().loc[lambda rows: rows["Publication year"] == 2018],
    x="GC category",
    y="Number of genes",
    kind="swarm",
    aspect=4,
    height=3,
    orient="v",
)
plt.title(
    "Number of genes for genomes published in 2018 with varying GC percentages"
)

# GC% split into 5 quantile-based bins.
pd.qcut(euk["GC%"], q=5)

# quantile_bins_plot.py

# Label the five quantile bins and store them as the GC category.
my_labels = ["very low", "low", "medium", "high", "very high"]
euk["GC category"] = pd.qcut(euk["GC%"], q=5, labels=my_labels)

# Swarm plot of gene counts per GC category for genomes published in 2018.
# Rows with missing values are dropped once, and the year filter is then
# applied to that same result instead of calling dropna() a second time.
sns.catplot(
    data=euk.dropna().loc[lambda rows: rows["Publication year"] == 2018],
    x="GC category",
    y="Number of genes",
    kind="swarm",
    aspect=4,
    height=3,
    orient="v",
)
plt.title(
    "Number of genes for genomes published in 2018 with varying GC percentages"
)

# Ten bins over the whole size range; count the genomes in each bin.
pd.cut(x=euk["Size (Mb)"], bins=10).value_counts().sort_index()

euk["Size category"] = pd.cut(x=euk["Size (Mb)"], bins=10)

# Horizontal count plot of the ten bins.
sns.catplot(
    kind="count",
    data=euk,
    y="Size category",
    color="orchid",
    aspect=2,
)
plt.title("Number of genomes in each size category")

# Repeat the binning using only genomes below 5000 Mb.
no_large = euk.loc[euk["Size (Mb)"].lt(5000)]
pd.cut(x=no_large["Size (Mb)"], bins=10).value_counts().sort_index()

# Bin the natural logarithm of the size instead of the raw size.
pd.cut(x=np.log(euk["Size (Mb)"]), bins=10).value_counts().sort_index()

# logarithmic_bins.py

# Bin edges that double at every step, from 1 up to 65536.
my_bins = [
    1, 2, 4, 8,
    16, 32, 64, 128,
    256, 512, 1024, 2048,
    4096, 8192, 16384, 32768,
    65536,
]

pd.cut(x=euk["Size (Mb)"], bins=my_bins).value_counts().sort_index()

# log_bins_plot.py

# Store the doubling bins and draw a horizontal count plot of them.
euk["Size category (Mb)"] = pd.cut(x=euk["Size (Mb)"], bins=my_bins)
sns.catplot(
    kind="count",
    data=euk,
    y="Size category (Mb)",
    color="orchid",
    aspect=2,
)
plt.title("Number of genomes in each size category")

# easy_bin_labels.py

# Label each bin with its upper edge: every edge except the first one.
euk["Size category (Mb)"] = pd.cut(
    x=euk["Size (Mb)"],
    labels=my_bins[1:],
    bins=my_bins,
)
sns.catplot(
    kind="count",
    data=euk,
    x="Size category (Mb)",
    color="orchid",
    aspect=2,
)
plt.title("Number of genomes in each size category")

# The powers of two from 2**0 to 2**16, generated instead of typed out.
print([2 ** n for n in range(17)])

# bins_summary.py

# Long-format table with one row per (kingdom, size category) pair and
# the number of genomes in that pair.
summary = (
    euk.groupby(by=["Kingdom", "Size category (Mb)"])
    .size()
    .reset_index(name="Number of genomes")
)
summary.head()

# bins_summary_plot.py

# Bar plot of the counts, one panel per kingdom.
g = sns.catplot(
    kind="bar",
    data=summary,
    col="Kingdom",
    x="Size category (Mb)",
    y="Number of genomes",
    color="grey",
    height=2.5,
)
# Rotate the tick labels by 90 degrees.
g.set_xticklabels(rotation=90)
g.fig.suptitle(
    "Number of genomes in each size category for different kingdoms",
    y=1.1,
)

# normalized_bins_summary_plot.py

# Divide each count by the total for its kingdom, so every kingdom's
# proportions are relative to that kingdom's own total.
summary["Proportion of genomes"] = summary.groupby(by="Kingdom")[
    "Number of genomes"
].transform(lambda counts: counts / counts.sum())

# Bar plot of the proportions, one panel per kingdom.
g = sns.catplot(
    kind="bar",
    data=summary,
    col="Kingdom",
    x="Size category (Mb)",
    y="Proportion of genomes",
    color="grey",
    height=2.5,
)
# Rotate the tick labels by 90 degrees.
g.set_xticklabels(rotation=90)
g.fig.suptitle(
    "Proportion of genomes in each size category for different kingdoms",
    y=1.1,
)

# line_bins_plot.py

# All kingdoms on one set of axes: a point plot with one colour per kingdom.
sns.catplot(
    kind="point",
    data=summary,
    x="Size category (Mb)",
    y="Proportion of genomes",
    hue="Kingdom",
    palette="Accent",
    height=3,
    aspect=4,
)

plt.title("Proportion of genomes in each size category per kingdom")

# Bird genomes only: marker size follows the raw genome size.
sns.relplot(
    data=euk.loc[euk["Class"].eq("Birds")].dropna(),
    x="Number of genes",
    y="Number of proteins",
    size="Size (Mb)",
    aspect=2,
)

plt.title("Number of genes vs. number of proteins for bird genomes")

# bins_for_size.py

# Four size bins, a label for each, and the marker size to draw per label.
my_bins = [0, 800, 1200, 1400, 1600]
my_labels = ["<800", "800-1200", "1200-1400", ">1400"]
my_sizes = {
    "<800": 10,
    "800-1200": 30,
    "1200-1400": 60,
    ">1400": 120,
}

euk["Size category (Mb)"] = pd.cut(
    x=euk["Size (Mb)"],
    labels=my_labels,
    bins=my_bins,
)

# Bird genomes only: marker size now follows the binned size category.
sns.relplot(
    data=euk.loc[euk["Class"].eq("Birds")].dropna(),
    x="Number of genes",
    y="Number of proteins",
    size="Size category (Mb)",
    sizes=my_sizes,
    aspect=2,
)

plt.title("Number of genes vs. number of proteins for bird genomes")

# Inspect the dtype of the "Size category" column.
euk["Size category"].dtype

# NOTE(review): `df` is not defined anywhere in the visible part of this
# file (every other statement operates on `euk`), so unless the header
# script provides it this line raises NameError. Presumably `euk` was
# intended, to show what happens when a value outside the column's existing
# categories is assigned — TODO confirm.
df.loc[0, "Size category"] = "banana"

# Median gene count within each size category.
euk.groupby("Size category")["Number of genes"].median()

# GC% per kingdom for genomes with more than 10000 genes. Missing gene
# counts are treated as 0, so they never pass this filter.
sns.catplot(
    kind="boxen",
    data=euk.loc[euk["Number of genes"].fillna(0).gt(10000)],
    x="Kingdom",
    y="GC%",
    color="purple",
    height=4,
)

plt.title(
    "Distribution of GC% across kingdoms\n"
    "for genomes with more than 10000 genes"
)

# The complementary subset: 10000 genes or fewer. Missing gene counts are
# treated as 0 here too, so those rows are included in this plot.
sns.catplot(
    kind="boxen",
    data=euk.loc[euk["Number of genes"].fillna(0).le(10000)],
    x="Kingdom",
    y="GC%",
    color="purple",
    height=4,
)
plt.title(
    "Distribution of GC% across kingdoms\n"
    "for genomes with 10000 genes or fewer"
)

# Median GC% for each kingdom.
euk.groupby(by="Kingdom")["GC%"].median()

# Kingdom names in the order produced by value_counts().
my_labels = euk["Kingdom"].value_counts().index.tolist()
my_labels

from pandas.api.types import CategoricalDtype

# Preview the column converted to an ordered categorical (not stored yet).
euk["Kingdom"].astype(CategoricalDtype(my_labels, ordered=True))

# column_to_category.py

my_labels = euk["Kingdom"].value_counts().index.tolist()

# Store the ordered categorical version of the column.
euk["Kingdom"] = euk["Kingdom"].astype(
    CategoricalDtype(my_labels, ordered=True)
)

# Mean GC% for each kingdom.
euk.groupby(by="Kingdom")["GC%"].mean()

# plot_category.py

# GC% per kingdom for complete rows with 10000 genes or fewer. Rows with
# missing values are dropped once, and the gene-count filter is then applied
# to that same result instead of calling dropna() a second time.
sns.catplot(
    data=euk.dropna().loc[lambda rows: rows["Number of genes"] <= 10000],
    x="Kingdom",
    y="GC%",
    color="purple",
    kind="boxen",
    height=4,
)
plt.title(
    "Distribution of GC% across kingdoms\nfor genomes with 10000 genes or fewer"
)