%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Default resolution for figures created from here on.
mpl.rc("figure", dpi=300)

# NOTE(review): `euk` is not defined in this file; presumably it is provided
# by standard_header.py -- confirm.

# Rows whose Kingdom is "Animals", keeping only rows with no missing values.
animals = euk.loc[euk["Kingdom"].eq("Animals")].dropna()

# Animal and plant rows with "Size (Mb)" below 5000, again keeping only
# rows with no missing values.
animals_and_plants = euk.loc[
    euk["Kingdom"].isin(["Animals", "Plants"]) & euk["Size (Mb)"].lt(5000)
].dropna()

# Baseline: the relational plot of gene count against class, for comparison
# with the categorical version that follows.
sns.relplot(
    data=animals,
    x="Number of genes",
    y="Class",
    aspect=2,
)
plt.gca().set_title("Number of genes for animal genomes in different classes")

# strip_plot.py

# Same columns, drawn with catplot and its default plot kind.
sns.catplot(
    data=animals,
    x="Number of genes",
    y="Class",
    aspect=2,
)
plt.gca().set_title("Number of genes for animal genomes in different classes")

# jitter.py

# Default catplot restricted to animal genomes whose assembly status is
# "Chromosome", with a wider jitter (0.3) than the default.
sns.catplot(
    data=animals[animals["Assembly status"] == "Chromosome"],
    y="Class",
    x="Number of genes",
    aspect=2,
    jitter=0.3,
)
# The title names the assembly status that the filter above actually selects
# ("Chromosome"); it previously said "scaffold", which did not match the data.
plt.title(
    "Number of genes for chromosome-level animal genomes\n in different classes"
)

# grid_strip_plot.py

# One facet row per kingdom: genome size against publication year, with
# points coloured by assembly status.
g = sns.catplot(
    data=animals_and_plants,
    row="Kingdom",
    x="Publication year",
    y="Size (Mb)",
    hue="Assembly status",
    aspect=2,
)

# Figure-level title, placed slightly above the top of the figure (y=1.05).
g.fig.suptitle(
    "Genome size for plants and animals in different assembly status",
    y=1.05,
)

# ordered_grid_strip_plot.py

# Same grid as the unordered version, but with the hue levels listed in an
# explicit order.
# NOTE(review): the original inline comment marks this call as a
# workaround for https://github.com/mwaskom/seaborn/issues/1761; which
# argument constitutes the workaround is not evident from this code -- confirm.
g = sns.catplot(
    data=animals_and_plants,
    row="Kingdom",
    x="Publication year",
    y="Size (Mb)",
    hue="Assembly status",
    hue_order=["Contig", "Scaffold", "Chromosome", "Complete Genome"],
    aspect=2,
)

# Figure-level title, placed slightly above the top of the figure (y=1.05).
g.fig.suptitle(
    "Genome size for plants and animals in different assembly status",
    y=1.05,
)

# taxonomy_plot.py

# One facet row per kingdom, with points coloured by taxonomic class using
# the "Set2" palette.
g = sns.catplot(
    data=animals_and_plants,
    row="Kingdom",
    x="Publication year",
    y="Size (Mb)",
    hue="Class",
    palette="Set2",
    aspect=2,
)

# Figure-level title, placed slightly above the top of the figure (y=1.05).
g.fig.suptitle(
    "Genome size for plants and animals colored by class",
    y=1.05,
)

# swarm_plot.py

# Swarm plot of gene count per class, restricted to animal genomes whose
# assembly status is "Chromosome".
sns.catplot(
    data=animals[animals["Assembly status"] == "Chromosome"],
    y="Class",
    x="Number of genes",
    aspect=2,
    kind="swarm",
)
# The title names the assembly status that the filter above actually selects
# ("Chromosome"); it previously said "scaffold", which did not match the data.
plt.title(
    "Number of genes for chromosome-level animal genomes in different classes"
)

# hue_swarm_plot.py

# Swarm plot of GC% for the two worm classes, coloured by assembly status
# with the hue levels in an explicit order.
sns.catplot(
    data=euk.loc[euk["Class"].isin(["Roundworms", "Flatworms"])],
    kind="swarm",
    x="Class",
    y="GC%",
    hue="Assembly status",
    hue_order=["Contig", "Scaffold", "Chromosome", "Complete Genome"],
    height=6,
    aspect=0.7,
)

plt.gca().set_title(
    "GC% of roundworms and flatworms\nin different assembly status"
)

# grid_swarm_plot.py

# One small swarm plot per publication year (wrapping after five columns),
# comparing genome sizes of mammals and birds.
g = sns.catplot(
    data=euk.loc[euk["Class"].isin(["Mammals", "Birds"])],
    kind="swarm",
    x="Class",
    y="Size (Mb)",
    col="Publication year",
    col_wrap=5,
    height=2.5,
)
g.fig.suptitle(
    "Sizes of genomes for mammals and birds published in each year",
    y=1.05,
)

# How many mammal genome records published in 2018 exist for each species.
euk.loc[
    (euk["Class"] == "Mammals") & (euk["Publication year"] == 2018),
    "Species",
].value_counts()

# GC% per kingdom with catplot's default plot kind, for comparison with the
# box plot below.
sns.catplot(
    data=euk,
    x="Kingdom",
    y="GC%",
)

plt.gca().set_title("Distribution of GC% across kingdoms")

# box_plot.py

# The same comparison summarised as box plots.
sns.catplot(
    data=euk,
    kind="box",
    x="Kingdom",
    y="GC%",
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# col_boxplot.py

# Plant and animal genomes published in 2017 or 2018. This subset was
# previously built twice with identical code; it is now defined once.
data = euk[
    (euk["Kingdom"].isin(["Plants", "Animals"]))
    & euk["Publication year"].isin([2017, 2018])
]

# One column of box plots per publication year, boxes grouped by kingdom and
# coloured by assembly status (hue levels in seaborn's default order).
g = sns.catplot(
    data=data,
    x="Kingdom",
    y="GC%",
    kind="box",
    hue="Assembly status",
    col="Publication year",
)

# Title spelling corrected ("Distibution" -> "Distribution",
# "gnomes" -> "genomes").
g.fig.suptitle(
    "Distribution of GC% in plant and animal genomes published in 2017 and 2018",
    y=1.05,
)

# Same figure, with the assembly-status levels in an explicit order.
g = sns.catplot(
    data=data,
    x="Kingdom",
    y="GC%",
    kind="box",
    hue="Assembly status",
    col="Publication year",
    hue_order=["Contig", "Scaffold", "Chromosome", "Complete Genome"],
)

g.fig.suptitle(
    "Distribution of GC% in plant and animal genomes published in 2017 and 2018",
    y=1.05,
)

# wide_boxplot.py

# One box per class in a single colour; the figure is made wide (aspect=3)
# to fit every class along the x axis.
sns.catplot(
    data=euk,
    kind="box",
    x="Class",
    y="GC%",
    color="orange",
    height=4,
    aspect=3,
)

# Rotate the class labels so they do not overlap.
plt.xticks(rotation=90)
plt.gca().set_title("Distribution of GC% across genomes in each class")

# Kingdom-level box plot again, as the reference for the violin plots.
sns.catplot(
    data=euk,
    kind="box",
    x="Kingdom",
    y="GC%",
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# violin_plot.py

# Four variations of the same violin plot of GC% per kingdom.
# NOTE(review): `bw` and `scale` are spelled as in older seaborn releases;
# confirm they are still accepted by the installed version.

# 1. All defaults.
sns.catplot(
    data=euk,
    kind="violin",
    x="Kingdom",
    y="GC%",
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# 2. cut=0: do not extend the density past the extreme data points.
sns.catplot(
    data=euk,
    kind="violin",
    x="Kingdom",
    y="GC%",
    cut=0,
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# 3. As (2) with an explicit kernel bandwidth setting of 0.1.
sns.catplot(
    data=euk,
    kind="violin",
    x="Kingdom",
    y="GC%",
    cut=0,
    bw=0.1,
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# 4. As (2) with violin widths scaled by the number of observations.
sns.catplot(
    data=euk,
    kind="violin",
    x="Kingdom",
    y="GC%",
    cut=0,
    scale="count",
)
plt.gca().set_title("Distribution of GC% across kingdoms")

# hue_violin_plot.py

# Gene-count violins per kingdom, one violin per assembly status, limited
# to scaffold- and chromosome-level records.
# NOTE(review): `euk_float` is not defined in this file; presumably it is
# provided by standard_header.py -- confirm.
g = sns.catplot(
    data=euk_float.loc[
        euk_float["Assembly status"].isin(["Scaffold", "Chromosome"])
    ],
    kind="violin",
    x="Kingdom",
    y="Number of genes",
    hue="Assembly status",
    cut=0,
    height=3,
    aspect=3,
)

plt.gca().set_title(
    "Distribution of number of genes across genomes from each kingdom"
)

# split_violin_plot.py

# With split=True the two assembly-status levels share a single violin per
# kingdom, one level on each half.
sns.catplot(
    data=euk_float.loc[
        euk_float["Assembly status"].isin(["Scaffold", "Chromosome"])
    ],
    kind="violin",
    x="Kingdom",
    y="Number of genes",
    hue="Assembly status",
    split=True,
    cut=0,
    aspect=2,
)
plt.gca().set_title(
    "Distribution of number of genes across genomes from each kingdom"
)

# One violin per class in a single colour, on a wide figure.
sns.catplot(
    data=euk,
    kind="violin",
    x="Class",
    y="GC%",
    color="orange",
    height=4,
    aspect=3,
)

# Rotate the class labels so they do not overlap.
plt.xticks(rotation=90)
plt.gca().set_title("Distribution of GC% across genomes from each class")

# boxen_plot.py

# Boxen plot of GC% per kingdom.
sns.catplot(
    data=euk,
    kind="boxen",
    x="Kingdom",
    y="GC%",
)
plt.gca().set_title("Distribution of GC% across genomes from each kingdom")

# wide_boxen_plot.py

# Boxen plot of GC% per class in a single colour, on a wide figure.
sns.catplot(
    data=euk,
    kind="boxen",
    x="Class",
    y="GC%",
    color="orange",
    aspect=3,
)

# Rotate the class labels so they do not overlap.
plt.xticks(rotation=90)
plt.gca().set_title("Distribution of GC% across genomes from each class")

# bar_plot.py

# Horizontal bars, one per class, using seaborn's default estimator and
# error bars (described by the title as the mean and 95% CI).
sns.catplot(
    data=euk_float,
    kind="bar",
    x="Number of genes",
    y="Class",
    color="darkgreen",
    aspect=1,
)

plt.gca().set_title(
    "Mean and 95% CI of number of genes\nfor genomes in different classes"
)
plt.xticks(rotation=90)

# Same bars with the median as the estimator and no error bars.
sns.catplot(
    data=euk_float,
    kind="bar",
    x="Number of genes",
    y="Class",
    color="darkgreen",
    estimator=np.median,
    ci=None,
    aspect=1,
)

plt.gca().set_title("Median number of genes\nfor genomes in different classes")
plt.xticks(rotation=90)

# hue_bar_plot.py

# Bars per publication year, one bar per kingdom within each year, drawn
# without error bars.
sns.catplot(
    data=euk_float,
    kind="bar",
    x="Publication year",
    y="Number of genes",
    hue="Kingdom",
    ci=None,
    height=4,
    aspect=3,
)

plt.gca().set_title("Mean number of genes in genomes published in each year")

# Horizontal bars of GC% per class, with error bars showing the standard
# deviation instead of a confidence interval.
sns.catplot(
    data=euk_float,
    kind="bar",
    x="GC%",
    y="Class",
    color="lightgreen",
    ci="sd",
    height=6,
    aspect=1,
)

plt.gca().set_title(
    "Mean and standard deviation of GC percentage\nfor genomes in different classes"
)
plt.xticks(rotation=90)

# point_plot.py

# Point-plot version of the GC% summary per class: standard-deviation error
# bars, with the points left unconnected (join=False).
sns.catplot(
    data=euk_float,
    kind="point",
    x="GC%",
    y="Class",
    color="lightgreen",
    ci="sd",
    join=False,
    aspect=1,
)

plt.gca().set_title(
    "Mean and standard deviation of GC percentage\nfor genomes in different classes"
)
plt.xticks(rotation=90)

# hue_point_plot.py

# Point plot of gene count per publication year, one line per kingdom, for
# plants, animals and fungi published after 2000.
sns.catplot(
    data=euk.loc[
        euk["Publication year"].gt(2000)
        & euk["Kingdom"].isin(["Plants", "Animals", "Fungi"])
    ],
    kind="point",
    x="Publication year",
    y="Number of genes",
    hue="Kingdom",
    orient="v",
    height=4,
    aspect=3,
)

plt.gca().set_title("Mean number of genes in genomes published in each year")

# Same plot with dodge=True, which offsets the kingdoms along the x axis.
sns.catplot(
    data=euk.loc[
        euk["Publication year"].gt(2000)
        & euk["Kingdom"].isin(["Plants", "Animals", "Fungi"])
    ],
    kind="point",
    x="Publication year",
    y="Number of genes",
    hue="Kingdom",
    dodge=True,
    orient="v",
    height=4,
    aspect=3,
)

plt.gca().set_title("Mean number of genes in genomes published in each year")

# The animal and plant records published in 2003, shown as a table.
euk.loc[
    (euk["Publication year"] == 2003)
    & euk["Kingdom"].isin(["Animals", "Plants"]),
    ["Species", "Kingdom", "Number of genes"],
]

# slope_plot.py

# Two-category point plot: number of proteins for "Contig" versus
# "Chromosome" assemblies, one line per kingdom (plants, animals, protists).
sns.catplot(
    data=euk.loc[
        euk["Assembly status"].isin(["Contig", "Chromosome"])
        & euk["Kingdom"].isin(["Plants", "Animals", "Protists"])
    ],
    kind="point",
    x="Assembly status",
    y="Number of proteins",
    order=["Contig", "Chromosome"],
    hue="Kingdom",
    palette="Set2",
    dodge=True,
    orient="v",
)

plt.gca().set_title(
    "Mean and 95% CI of number of proteins \nfor genomes in different assembly states"
)

# compare_temp.py

# Bar per city with standard-deviation error bars, for London and Edinburgh.
# NOTE(review): `weather` is not defined in this file; presumably it is
# provided by standard_header.py -- confirm.
sns.catplot(
    data=weather.loc[weather["City"].isin(["London", "Edinburgh"])],
    kind="bar",
    x="City",
    y="Mean temperature",
    ci="sd",
)
plt.gca().set_title(
    "Mean and standard deviation\n of daily temperature in London and Edinburgh"
)

# ttest_temp.py

from scipy.stats import ttest_ind

# Independent two-sample t-test on every "Mean temperature" value recorded
# for each of the two cities.
edinburgh_temps = weather.loc[weather["City"] == "Edinburgh", "Mean temperature"]
london_temps = weather.loc[weather["City"] == "London", "Mean temperature"]

ttest_ind(edinburgh_temps, london_temps)

# ttest_day.py

from scipy.stats import ttest_ind

# The same test, restricted to rows where "Day of year" is 200.
edinburgh_temps = weather.loc[
    (weather["Day of year"] == 200) & (weather["City"] == "Edinburgh"),
    "Mean temperature",
]

london_temps = weather.loc[
    (weather["Day of year"] == 200) & (weather["City"] == "London"),
    "Mean temperature",
]

ttest_ind(edinburgh_temps, london_temps)

# mwu_test.py

from scipy.stats import mannwhitneyu

# Do plants and animals have different genome sizes?
# "Different" is a two-sided question, so the alternative hypothesis is
# stated explicitly rather than left to the SciPy default, which has not
# been the same across releases.
mannwhitneyu(
    euk[euk["Kingdom"] == "Animals"]["Size (Mb)"],
    euk[euk["Kingdom"] == "Plants"]["Size (Mb)"],
    alternative="two-sided",
)

# First five rows of the weather table.
weather.head(5)

# Default catplot of mean temperature against day of year, coloured by city,
# for Berlin and Edinburgh.
sns.catplot(
    data=weather.loc[weather["City"].isin(["Berlin", "Edinburgh"])],
    x="Day of year",
    y="Mean temperature",
    hue="City",
    height=4,
    aspect=3,
)
plt.gca().set_title(
    "Mean daily temperature for each day of the year\nsince 1960 in Berlin and Edinburgh"
)

# Same data as a point plot with standard-deviation error bars.
sns.catplot(
    data=weather.loc[weather["City"].isin(["Berlin", "Edinburgh"])],
    kind="point",
    x="Day of year",
    y="Mean temperature",
    hue="City",
    ci="sd",
    height=4,
    aspect=3,
)
plt.gca().set_title(
    "Mean daily temperature for each day of the year\nsince 1960 in Berlin and Edinburgh"
)

# line_plot.py

# Relational line plot of the same data: one line per city, drawn with
# ci="sd" so the spread shown is the standard deviation.
sns.relplot(
    data=weather.loc[weather["City"].isin(["Berlin", "Edinburgh"])],
    kind="line",
    x="Day of year",
    y="Mean temperature",
    hue="City",
    ci="sd",
    height=4,
    aspect=3,
)

plt.gca().set_title(
    "Mean daily temperature for each day of the year\nsince 1960 in Berlin and Edinburgh"
)

# count_plot.py

# Count of rows per publication year, with one bar per assembly status
# within each year.
sns.catplot(
    data=euk,
    kind="count",
    x="Publication year",
    hue="Assembly status",
    height=4,
    aspect=3,
)

plt.gca().set_title(
    "Total number of genomes sequenced in different assembly status over time"
)