%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

mpl.rcParams["figure.dpi"] = 300

# Which five animal classes have the most sequenced genomes?
# Count the rows per class among the animals and keep the five largest counts.
is_animal = euk["Kingdom"] == "Animals"
euk.loc[is_animal, "Class"].value_counts().head(5)

show_all_rows()

# reptiles_size_genes.py

# Keep the reptiles only, and discard outliers by requiring more than
# 1000 genes. Any missing entry in the gene-count mask counts as False.
is_reptile = euk["Class"] == "Reptiles"
has_enough_genes = (euk["Number of genes"] > 1000).fillna(False)
selected = euk[is_reptile & has_enough_genes]

# Show just the columns of interest for the selected rows.
selected[["Species", "Size (Mb)", "Number of genes"]]

show_some_rows()

# reptiles_size_genes_plot.py

# Scatter plot of gene count against genome size for the selected reptiles.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    data=selected,
)

# pad=20 adds space between the title and the top of the axes
plt.title("Genome size and number of genes for reptiles", pad=20)

import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

# need to do this again after running %matplotlib magic
import matplotlib as mpl

mpl.rcParams["figure.dpi"] = 300

# gc_histogram.py

# Histogram of GC percentage with a KDE curve drawn over it.
# histplot replaces the deprecated distplot; the arguments below reproduce
# distplot's defaults: bars normalised to a density, plus a KDE that
# extends 3 bandwidths beyond the extreme data points.
sns.histplot(
    euk["GC%"].dropna(),  # missing GC values are removed before plotting
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

# gc_histogram_custom.py

# set the shape of the plot
plt.figure(figsize=(8, 4))

# histplot replaces the deprecated distplot; with kde=False it draws
# plain counts per bin, as distplot(kde=False) did
sns.histplot(
    euk["GC%"].dropna(),  # this is our series of values
    color="red",  # change the color
    bins=100,  # set the number of bins
    kde=False,  # don't calculate KDE
)

# set the x limit between 0 and 100 for a percentage
plt.xlim((0, 100))

# set a title for the chart
plt.title("Distribution of GC percentage across eukaryote genomes")

# gc_histogram_multiple.py

plt.figure(figsize=(8, 4))

# for each unique kingdom (a missing kingdom would match no rows, so skip it)...
for kingdom in euk["Kingdom"].dropna().unique():

    # ...select just the rows for that kingdom...
    one_kingdom = euk[euk["Kingdom"] == kingdom]

    # ... and plot a KDE curve of the GC values; kdeplot replaces the
    # deprecated distplot(hist=False)
    sns.kdeplot(one_kingdom["GC%"].dropna(), label=kingdom)

# draw the legend explicitly so each curve's kingdom label is shown
plt.legend()

plt.title(
    "Distribution of GC percentage for genomes\nbelonging to different kingdoms"
)

# size_genes_scatter.py

# Gene count against genome size, using every row of the table.
sns.relplot(
    x="Size (Mb)",
    y="Number of genes",
    data=euk,
)

plt.title("Genome size vs number of genes\n for all genomes")

# filtered_size_genes_scatter.py

# Same scatter plot, restricted to genomes smaller than 5000 Mb.
small_genomes = euk[euk["Size (Mb)"] < 5000]

sns.relplot(data=small_genomes, x="Size (Mb)", y="Number of genes")

plt.title("Genome size vs number of genes\n for genomes < 5000 Mb")

# hue_scatter.py

# Animal genomes only; dropna() discards every row that has a missing
# value in any column.
is_animal = euk["Kingdom"] == "Animals"
animals = euk[is_animal].dropna()

# Colour each point by its GC percentage.
sns.relplot(data=animals, x="Size (Mb)", y="Number of genes", hue="GC%")

plt.title("Genome size vs number of genes\n for animal genomes")

# point_size_scatter.py

# Protein count against gene count, with marker size mapped to genome size.
# sizes=(2, 150) gives the smallest and largest marker sizes.
sns.relplot(
    x="Number of genes",
    y="Number of proteins",
    size="Size (Mb)",
    sizes=(2, 150),
    data=animals,
)
plt.title("Number of genes vs number of proteins\n for animal genomes")

# complex_scatter.py

# Protein count against gene count, encoding two further columns:
# marker size follows genome size and marker colour follows publication year.
sns.relplot(
    x="Number of genes",
    y="Number of proteins",
    hue="Publication year",
    size="Size (Mb)",
    sizes=(2, 150),
    data=animals,
)
plt.title("Number of genes vs number of proteins\n for animal genomes")

# gene_density_scatter.py

# create gene density column
# The size column is in megabases, so multiply it by 1000 to get kilobases
# and divide the gene count by that. (Multiplying the ratio by 1000 instead
# would give genes per gigabase, not genes per kilobase.)
animals["Genes per Kb"] = animals["Number of genes"] / (animals["Size (Mb)"] * 1000)

# plot gene density against genome size
sns.relplot(
    data=animals,
    x="Size (Mb)",
    y="Genes per Kb",
    color="purple",
)

plt.title("Genome size vs gene density\n for animal genomes")