%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Default figure resolution for every plot below.
mpl.rcParams["figure.dpi"] = 300

# Rows whose kingdom is "Animals", minus any row with a missing value.
is_animal = euk["Kingdom"] == "Animals"
animals = euk[is_animal].dropna()

# named_color.py

# Single color given by name.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    color="purple",
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# hex_color.py

# Single color given as a hex string.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    color="#00b3c7",
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# rgb_color.py

# Single color given as a tuple of three numbers.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    color=(1, 0.5, 0),
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# too similar to white!
sns.relplot(
    x="Size (Mb)", y="Number of genes", color="mintcream", data=animals
)
plt.title("Genome size vs number of genes\n for animal genomes")

# grey_chart.py

# Box plot of GC% per kingdom, drawn with a single white color.
sns.catplot(
    x="Kingdom",
    y="GC%",
    kind="box",
    color="white",
    height=4,
    data=euk,
)
plt.title("Distribution of GC%\n for genomes in different kingdoms", y=1.05)

from scipy import ndimage

# Display the saved colormap image rotated by -90 degrees, with axes hidden.
fig, ax = plt.subplots()
i = plt.imread("colormaps_sequential_1.png")
plt.imshow(ndimage.rotate(i, -90))
ax.axis("off")

# sequential_palette.py

# Points are colored by GC% using the "YlOrBr" palette.
sns.relplot(
    x="Size (Mb)", y="Number of genes", hue="GC%", palette="YlOrBr", data=animals
)
plt.title("Genome size vs number of genes\n for animal genomes")

# Same plot, colored with the "jet" palette instead.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    hue="GC%",
    palette="jet",
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# reversed_palette.py

# Points are colored by publication year using the "OrRd_r" palette.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    hue="Publication year",
    palette="OrRd_r",
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# custom_sequential_palette.py

# Build the colormap from a single named color, then pass it as the palette.
purple_cmap = sns.light_palette("purple", as_cmap=True)
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    hue="Publication year",
    palette=purple_cmap,
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

# Single white color for every box.
sns.catplot(
    x="Kingdom",
    y="GC%",
    kind="box",
    color="white",
    height=4,
    data=euk,
)
plt.title("Distribution of GC%\n for genomes in different kingdoms")

# Same box plot using the "BuGn" palette.
sns.catplot(
    x="Kingdom",
    y="GC%",
    kind="box",
    palette="BuGn",
    height=4,
    data=euk,
)
plt.title("Distribution of GC%\n for genomes in different kingdoms")

from scipy import ndimage

# Display the saved colormap image rotated by -90 degrees, with axes hidden.
fig, ax = plt.subplots()
i = plt.imread("colormaps_categorical.png")
plt.imshow(ndimage.rotate(i, -90))
ax.axis("off")

# categorical_palette.py

# Same box plot using the "Set2" palette.
sns.catplot(
    x="Kingdom",
    y="GC%",
    kind="box",
    palette="Set2",
    height=4,
    data=euk,
)
plt.title("Distribution of GC%\n for genomes in different kingdoms")

# grid_color.py

# One box-plot panel per publication year after 2013, three panels per row.
recent_genomes = euk[euk["Publication year"] > 2013]
g = sns.catplot(
    data=recent_genomes,
    x="Kingdom",
    y="GC%",
    kind="box",
    col="Publication year",
    col_wrap=3,
    height=3.5,
    palette="Dark2",
)
g.fig.suptitle("Distribution of GC% for genomes in different kingdoms", y=1.05)

# scatter_grid.py

# Same panel layout as a scatter plot, with kingdom mapped to color.
recent_genomes = euk[euk["Publication year"] > 2013]
g = sns.relplot(
    data=recent_genomes,
    x="Size (Mb)",
    y="Number of genes",
    hue="Kingdom",
    col="Publication year",
    col_wrap=3,
    height=3.5,
    palette="Dark2",
)
g.fig.suptitle(
    "Genome size vs. number of genes for genomes in different kingdoms", y=1.05
)

# custom_category_colors.py

# The palette is an explicit list of five named colors.
kingdom_colors = ["salmon", "gold", "seagreen", "indigo", "firebrick"]
sns.catplot(
    x="Kingdom", y="GC%", kind="box", palette=kingdom_colors, height=4, data=euk
)
plt.title("Distribution of GC%\n for genomes in different kingdoms")
# Bare None as the final expression (presumably to keep the notebook cell from
# echoing the return value of plt.title).
None

# These are bar plots of "Number of genes", not GC% distributions: with
# kind="bar" each bar is an aggregate (the mean, by seaborn's default
# estimator) of the y column within each x category. The titles say so, in
# line with the "Mean number of genes" titles on the later per-class bar plots.
sns.catplot(data=euk, kind="bar", x="Kingdom", y="Number of genes", orient="v")
plt.title("Mean number of genes\n for genomes in different kingdoms")

# Same aggregate per class; the wide aspect and rotated tick labels make room
# for the many class names on the x axis.
sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes", aspect=3, orient="v"
)
plt.title("Mean number of genes for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")

# Mean GC% over the animal genomes.
animals["GC%"].mean()

# Add a column holding each row's GC% minus that mean.
gc_percent = animals["GC%"]
animals["GC difference from mean"] = gc_percent - gc_percent.mean()
animals.head()

# difference_plot.py

mean_gc = animals["GC%"].mean()
animals["GC difference from mean"] = animals["GC%"] - mean_gc

# Scatter plot with the difference on the x axis, limited to -15..15.
sns.relplot(
    x="GC difference from mean",
    y="Number of genes",
    data=animals,
)
plt.xlim(-15, 15)
plt.title("Number of genes vs. GC difference\n from mean for animal genomes")

# Use the difference as the color of each point instead.
sns.relplot(
    x="Size (Mb)", y="Number of genes", hue="GC difference from mean", data=animals
)
plt.title("Genome size vs number of genes\n for animal genomes")

# diverging_palette.py

# The column name contains a line break; it is used as the hue variable below.
gc_percent = animals["GC%"]
animals["GC difference\n from mean"] = gc_percent - gc_percent.mean()

sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    hue="GC difference\n from mean",
    hue_norm=(-10, 10),  # make zero the middle of the color scale
    palette="RdBu",
    data=animals,
)
plt.title("Genome size vs number of genes\n for animal genomes")

from scipy import ndimage

# Display the saved colormap image rotated by -90 degrees, with axes hidden.
fig, ax = plt.subplots()
i = plt.imread("colormaps_diverging.png")
plt.imshow(ndimage.rotate(i, -90))
ax.axis("off")

# One box-plot panel per year for 2016-2018, colored with the "Dark2" palette.
genomes_2016_to_2018 = euk[euk["Publication year"].isin([2016, 2017, 2018])]
g = sns.catplot(
    data=genomes_2016_to_2018,
    x="Kingdom",
    y="GC%",
    kind="box",
    col="Publication year",
    height=3.5,
    palette="Dark2",
)
g.fig.suptitle("Distribution of GC% for genomes in different kingdoms", y=1.05)
None

# Same panels with one color for every box.
genomes_2016_to_2018 = euk[euk["Publication year"].isin([2016, 2017, 2018])]
g = sns.catplot(
    data=genomes_2016_to_2018,
    x="Kingdom",
    y="GC%",
    kind="box",
    col="Publication year",
    height=3.5,
    color="lightskyblue",
)
g.fig.suptitle("Distribution of GC% for genomes in different kingdoms", y=1.05)
None

# color_and_marker.py

# Class is mapped to both color (hue) and marker (style).
mammals_and_birds = euk[euk["Class"].isin(["Mammals", "Birds"])].dropna()
sns.relplot(
    data=mammals_and_birds,
    x="Size (Mb)",
    y="Number of genes",
    hue="Class",
    style="Class",
    palette="Set1",
)
plt.title("Genome size vs number of genes\n for mammal and bird genomes")

# color_and_size.py

# Genome size is mapped to both color (hue) and point size.
birds = euk[euk["Class"].isin(["Birds"])].dropna()
sns.relplot(
    data=birds,
    x="Number of genes",
    y="Number of proteins",
    hue="Size (Mb)",
    size="Size (Mb)",
    sizes=(2, 100),
    palette="YlOrBr",
)
plt.title("Number of genes and number of proteins\n for bird genomes")

# color_and_style.py

# Plant and fungal genomes published after 2000.
after_2000 = euk["Publication year"] > 2000
plant_or_fungus = euk["Kingdom"].isin(["Plants", "Fungi"])

# Kingdom is mapped to color, and each kingdom gets its own line style.
sns.catplot(
    data=euk[after_2000 & plant_or_fungus],
    kind="point",
    x="Publication year",
    y="Number of genes",
    hue="Kingdom",
    linestyles=["-", "--"],
    dodge=True,
    orient="v",
    height=4,
    aspect=3,
)

plt.title(
    "Mean number of genes for plant and fungal genomes\nsequenced each year since 2001",
)

# color_and_style_line.py

# City is mapped to both color (hue) and line style.
berlin_and_edinburgh = weather[weather["City"].isin(["Berlin", "Edinburgh"])]
sns.relplot(
    data=berlin_and_edinburgh,
    kind="line",
    x="Day of year",
    y="Mean temperature",
    hue="City",
    style="City",
    ci="sd",
    height=4,
    aspect=3,
)

plt.title(
    "Mean daily temperature for each day of the year\nsince 1960 in Berlin and Edinburgh"
)

# Every bar in the same single color.
sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes",
    color="lightblue", orient="v", aspect=2,
)
plt.title("Mean number of genes\n for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")

# Bars colored by kingdom.
sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes",
    hue="Kingdom", orient="v", aspect=2,
)
plt.title("Mean number of genes\n for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")

# color_as_category.py

# Bars colored by kingdom, with dodging switched off.
sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes",
    hue="Kingdom", dodge=False, orient="v", aspect=2,
)
plt.title("Mean number of genes\n for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")
None

# ordered_labels.py

# Explicit left-to-right order for the class bars.
# NOTE(review): the order looks like it keeps classes of the same kingdom
# next to each other - confirm against the data.
labels_order = [
    "Insects",
    "Other Animals",
    "Mammals",
    "Roundworms",
    "Birds",
    "Fishes",
    "Flatworms",
    "Reptiles",
    "Amphibians",
    "Basidiomycetes",
    "Ascomycetes",
    "Other Fungi",
    "Other",
    "Land Plants",
    "Green Algae",
    "Other Plants",
    "Apicomplexans",
    "Kinetoplasts",
    "Other Protists",
]

sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes",
    order=labels_order, hue="Kingdom", dodge=False, orient="v", aspect=2,
)
plt.title("Mean number of genes\n for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")

# sorted_labels.py

# Explicit left-to-right order for the class bars.
# NOTE(review): presumably sorted by the plotted value - confirm against the
# data.
labels_order = [
    "Apicomplexans",
    "Ascomycetes",
    "Other Fungi",
    "Green Algae",
    "Basidiomycetes",
    "Kinetoplasts",
    "Other",
    "Other Protists",
    "Insects",
    "Flatworms",
    "Birds",
    "Roundworms",
    "Other Animals",
    "Reptiles",
    "Other Plants",
    "Amphibians",
    "Mammals",
    "Fishes",
    "Land Plants",
]

sns.catplot(
    data=euk, kind="bar", x="Class", y="Number of genes",
    order=labels_order, hue="Kingdom", dodge=False, orient="v", aspect=2,
)
plt.title("Mean number of genes\n for genomes in different classes")
plt.xticks(rotation=45, horizontalalignment="right")

def warm_or_cold(class_name):
    """Return a thermoregulation label for a class name.

    "Mammals" and "Birds" map to "Warm-blooded"; every other value maps to
    "Cold-blooded".
    """
    return (
        "Warm-blooded"
        if class_name in ("Mammals", "Birds")
        else "Cold-blooded"
    )

# Label every row by passing its "Class" value through warm_or_cold.
thermoregulation = animals["Class"].apply(warm_or_cold)
thermoregulation

# Store the labels as a new column and preview the frame.
animals["Thermoregulation"] = thermoregulation
animals.head()

# color_as_metadata.py


def warm_or_cold(class_name):
    """Return "Warm-blooded" for "Mammals" or "Birds", else "Cold-blooded"."""
    # Guard clause: anything that is neither of the two names is cold-blooded.
    if class_name != "Mammals" and class_name != "Birds":
        return "Cold-blooded"
    return "Warm-blooded"


# Derive the label column from each row's class name.
class_names = animals["Class"]
animals["Thermoregulation"] = class_names.apply(warm_or_cold)

# Horizontal bars (class on the y axis), colored by the derived label.
sns.catplot(
    data=animals, kind="bar", x="Number of genes", y="Class",
    hue="Thermoregulation", dodge=False, height=5,
)

plt.title("Mean and 95% CI of number of genes for each class")

# highlight_subtle.py

# The third entry of `order` ("Animals") gets a different color from the rest.
genomes_2016_to_2018 = euk[euk["Publication year"].isin([2016, 2017, 2018])]
g = sns.catplot(
    data=genomes_2016_to_2018,
    kind="box",
    x="Kingdom",
    y="GC%",
    col="Publication year",
    height=4,
    order=["Plants", "Fungi", "Animals", "Protists", "Other"],
    palette=["goldenrod", "goldenrod", "saddlebrown", "goldenrod", "goldenrod"],
)
g.fig.suptitle("Distribution of GC% for genomes in different kingdoms", y=1.05)

# highlight_bold.py

# Same idea with a stronger contrast: white everywhere except the third entry.
genomes_2016_to_2018 = euk[euk["Publication year"].isin([2016, 2017, 2018])]
g = sns.catplot(
    data=genomes_2016_to_2018,
    kind="box",
    x="Kingdom",
    y="GC%",
    col="Publication year",
    height=4,
    order=["Plants", "Fungi", "Animals", "Protists", "Other"],
    palette=[
        "white",
        "white",
        "crimson",
        "white",
        "white",
    ],
)
g.fig.suptitle("Distribution of GC% for genomes in different kingdoms", y=1.05)

# Missing gene counts are treated as 0, so those rows land in the
# "10000 or fewer" plot.
many_genes = euk[euk["Number of genes"].fillna(0) > 10000]
sns.catplot(data=many_genes, kind="boxen", x="Kingdom", y="GC%", palette="Accent")
plt.title(
    "Distribution of GC% across kingdoms\nfor genomes with more than 10000 genes"
)

few_genes = euk[euk["Number of genes"].fillna(0) <= 10000]
sns.catplot(data=few_genes, kind="boxen", x="Kingdom", y="GC%", palette="Accent")
plt.title(
    "Distribution of GC% across kingdoms\nfor genomes with 10000 genes or fewer"
)

# Passing the same explicit order to both plots keeps each kingdom in the same
# x position in both.
kingdom_order = [
    "Plants",
    "Animals",
    "Fungi",
    "Protists",
    "Other",
]

many_genes = euk[euk["Number of genes"].fillna(0) > 10000]
sns.catplot(
    data=many_genes,
    kind="boxen",
    x="Kingdom",
    y="GC%",
    order=kingdom_order,
    palette="Accent",
)
plt.title(
    "Distribution of GC% across kingdoms\nfor genomes with more than 10000 genes"
)

few_genes = euk[euk["Number of genes"].fillna(0) <= 10000]
sns.catplot(
    data=few_genes,
    kind="boxen",
    x="Kingdom",
    y="GC%",
    order=kingdom_order,
    palette="Accent",
)
plt.title(
    "Distribution of GC% across kingdoms\nfor genomes with 10000 genes or fewer"
)

# Bin every genome into one of two labels by gene count; missing counts are
# treated as 0 and therefore fall into "10,000 or fewer".
gene_counts = euk["Number of genes"].fillna(0)
euk["Gene count category"] = gene_counts.apply(
    lambda count: "More than 10,000" if count > 10000 else "10,000 or fewer"
)
euk.head()

# simple_binning.py

gene_counts = euk["Number of genes"].fillna(0)
euk["Gene count category"] = gene_counts.apply(
    lambda count: "More than 10,000" if count > 10000 else "10,000 or fewer"
)

# One panel per category, with the column order given explicitly.
g = sns.catplot(
    data=euk,
    kind="boxen",
    x="Kingdom",
    y="GC%",
    col="Gene count category",
    col_order=["10,000 or fewer", "More than 10,000"],
    palette="Accent",
)
g.fig.suptitle("Distribution of GC% across kingdoms", y=1.05)

# The same plot without splitting by category.
sns.catplot(
    x="Kingdom",
    y="GC%",
    kind="boxen",
    data=euk,
)
plt.title("Distribution of GC% across kingdoms")

# Genomes under 5000 Mb, colored by assembly status with no palette given.
smaller_genomes = euk[euk["Size (Mb)"] < 5000]
sns.relplot(
    data=smaller_genomes, x="Size (Mb)", y="Number of genes",
    hue="Assembly status", aspect=2,
)
plt.title(
    "Genome size vs. number of genes\nfor genomes in different stages of assembly"
)

# Same plot with the "Set2" palette.
smaller_genomes = euk[euk["Size (Mb)"] < 5000]
sns.relplot(
    data=smaller_genomes, x="Size (Mb)", y="Number of genes",
    hue="Assembly status", palette="Set2", aspect=2,
)
plt.title(
    "Genome size vs. number of genes\nfor genomes in different stages of assembly"
)