%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

mpl.rcParams["figure.dpi"] = 300

# Mean GC% for every (Kingdom, Assembly status) pair. unstack() pivots the
# inner grouping level (Assembly status) into columns, so the result is a
# Kingdom x Assembly-status table.
# NOTE(review): `euk` is not defined in this file; presumably it comes from
# standard_header.py -- confirm.
summary = euk.groupby(["Kingdom", "Assembly status"])["GC%"].mean().unstack()
summary  # bare expression; presumably shown as the notebook cell output

# heatmap.py

# Same table as above, recomputed.
summary = euk.groupby(["Kingdom", "Assembly status"])["GC%"].mean().unstack()

# Heatmap with seaborn's default settings.
sns.heatmap(summary)
plt.title(
    "Mean GC% for genomes belonging to each kingdom\nin each assembly status"
)

# styled_heatmap.py

summary = euk.groupby(["Kingdom", "Assembly status"])["GC%"].mean().unstack()

# Small square figure, "Blues" palette, colour scale pinned to 0-100 and
# every cell annotated with its value.
plt.figure(figsize=(3, 3))
sns.heatmap(summary, cmap="Blues", vmin=0, vmax=100, annot=True)
plt.title(
    "Mean GC% for genomes \nbelonging to each kingdom\nin each assembly status"
)

# mb_summary.py

# Sum of "Size (Mb)" for every (Class, Publication year) pair, scaled by
# 1/1000, with publication years pivoted into columns.
mb_per_year = (
    euk.groupby(["Class", "Publication year"])["Size (Mb)"]
    .sum()
    .apply(lambda x: x / 1000)  # divide each sum by 1000 to get Gb
    .unstack()
)
mb_per_year

# mb_summary_heatmap.py

# Wide figure; the colour bar is shrunk to half height and labelled, and
# cells are separated by lines of width 1.
plt.figure(figsize=(15, 5))
sns.heatmap(
    mb_per_year,
    cmap="GnBu",
    cbar_kws={"shrink": 0.5, "label": "Gb sequenced"},
    linewidths=1,
)
plt.title("Gigabases of genome sequence published for each class in each year")

# filled_summary.py

# Same table as above, but (Class, year) combinations that are missing after
# unstack() are filled with 0 instead of being left as NaN.
mb_per_year = (
    euk.groupby(["Class", "Publication year"])["Size (Mb)"]
    .sum()
    .apply(lambda x: x / 1000)
    .unstack()
    .fillna(0)
)
mb_per_year

# filled_summary_heatmap.py

# Identical plotting call to the unfilled version; only the input differs.
plt.figure(figsize=(15, 5))
sns.heatmap(
    mb_per_year,
    cmap="GnBu",
    cbar_kws={"shrink": 0.5, "label": "Gb sequenced"},
    linewidths=1,
)
plt.title("Gigabases of genome sequence published for each class in each year")

# NOTE(review): `weather` is not defined in this file; presumably it comes
# from standard_header.py -- confirm.
weather

# berlin_summary.py

# Keep only the Berlin rows, then take the first "Mean temperature" value in
# each (Year, Day of year) group and pivot days into columns, giving a
# Year x Day-of-year table.
berlin_summary = (
    weather[weather["City"] == "Berlin"]
    .groupby(["Year", "Day of year"])["Mean temperature"]
    .first()
    .unstack()
)
berlin_summary

# temp_heatmap.py

plt.figure(figsize=(15, 5))

# "hot" palette.
sns.heatmap(berlin_summary, cmap="hot")

plt.title(
    "A literal heatmap: mean daily temperature (°C) in Berlin since 1960"
)

# diverging_heatmap.py

plt.figure(figsize=(15, 5))

# Diverging palette with the colour scale centred on 0.
sns.heatmap(
    berlin_summary, cmap="RdBu_r", center=0  # reversed so that red is highest
)

plt.title("A literal heatmap: mean daily temperature in Berlin since 1960")

# city_summary.py

# First "Mean temperature" value per (Year, Day of year, City); unstack()
# pivots City into columns, leaving a (Year, Day of year) row index.
city_summary = (
    weather.groupby(["Year", "Day of year", "City"])["Mean temperature"]
    .first()
    .unstack()
)
city_summary

# Series indexed by (Year, Day of year): London minus Edinburgh.
difference = city_summary["London"] - city_summary["Edinburgh"]
difference

# Pivot Day of year into columns -> Year x Day-of-year table.
difference.unstack()

# difference_heatmap.py

plt.figure(figsize=(15, 5))

# "GnBu" palette, colour scale centred on 0.
sns.heatmap(difference.unstack(), cmap="GnBu", center=0)

plt.title("Difference in daily mean temperature between London and Edinburgh")

# diverging_difference.py

plt.figure(figsize=(15, 5))

# Same data with the diverging "RdBu_r" palette, also centred on 0.
sns.heatmap(difference.unstack(), cmap="RdBu_r", center=0)

plt.title("Difference in daily mean temperature between London and Edinburgh")

# daily_summary.py

# Rows whose Year lies between 1980 and 1990 (Series.between includes both
# end points by default). After grouping by (City, Day of year, Year), the
# first unstack() moves Year into the columns and the second moves Day of
# year, leaving one row per city and (Year, Day of year) columns.
ten_years = (
    weather[weather["Year"].between(1980, 1990)]
    .groupby(["City", "Day of year", "Year"])["Mean temperature"]
    .first()
    .unstack()
    .unstack()
)
ten_years

# daily_summary_plot.py

# Very wide, short figure: one heatmap row per city.
plt.figure(figsize=(15, 2))

sns.heatmap(ten_years, cmap="hot")

plt.title("Mean daily temperature in three cities between 1980 and 1990")

plt.figure(figsize=(8, 4))

# Draw one kernel density curve of GC% per kingdom, all on the same axes.
# for each unique kingdom...
for kingdom in euk["Kingdom"].unique():

    # ...select just the rows for that kingdom...
    one_kingdom = euk[euk["Kingdom"] == kingdom]

    # ... and plot the density of its non-missing GC values.
    # sns.kdeplot is the replacement for the deprecated
    # sns.distplot(..., hist=False).
    sns.kdeplot(one_kingdom["GC%"].dropna(), label=kingdom)

# Recent seaborn versions no longer draw a legend automatically for `label`,
# so request it explicitly.
plt.legend()

plt.title(
    "Distribution of GC percentage for genomes\nbelonging to different kingdoms"
)

# Violin plot of GC% for each kingdom.
sns.catplot(data=euk, x="Kingdom", y="GC%", kind="violin")

plt.title(
    "Distribution of GC percentage for genomes\nbelonging to different kingdoms"
)

# Bin GC% into intervals with edges 0, 5, ..., 95 (range() excludes 100).
pd.cut(euk["GC%"], bins=range(0, 100, 5))

# gc_counts_summary.py

# Number of rows per (Kingdom, GC% bin). Each bin is labelled with its upper
# edge (5, 10, ..., 95); unstack() pivots the bins into columns.
summary = (
    euk.groupby(
        [
            "Kingdom",
            pd.cut(euk["GC%"], bins=range(0, 100, 5), labels=range(5, 100, 5)),
        ]
    )
    .size()
    .unstack()
)
summary

# gc_counts_heatmap.py

plt.figure(figsize=(15, 5))

# annot=True writes each count into its cell using seaborn's default format.
sns.heatmap(summary, cmap="PuRd", annot=True)

plt.title("Number of genomes in each GC range per kingdom")

# annotated_heatmap.py

plt.figure(figsize=(15, 5))

# fmt=".0f" prints the annotations with no decimal places.
sns.heatmap(summary, cmap="PuRd", annot=True, fmt=".0f")

plt.title("Number of genomes in each GC range per kingdom")

# more_bins.py

# Bin edges 12, 14, ..., 74 (31 bins), each labelled with its upper edge
# (14, 16, ..., 74).
my_bins = range(12, 75, 2)
my_labels = range(14, 75, 2)

# Number of rows per (Kingdom, GC% bin), with bins pivoted into columns.
summary = (
    euk.groupby(
        ["Kingdom", pd.cut(euk["GC%"], bins=my_bins, labels=my_labels)]
    )
    .size()
    .unstack()
)

plt.figure(figsize=(15, 5))
# square=True forces every heatmap cell to be square.
sns.heatmap(summary, cmap="PuRd", square=True)
plt.title("Number of genomes in each GC range per kingdom")

# Turn counts into per-kingdom proportions: missing cells become 0, then each
# row (axis=1) is divided by its own total so that every row sums to 1.
summary.fillna(0).apply(lambda x: x / sum(x), axis=1)

# normalized_heatmap.py

plt.figure(figsize=(20, 5))

sns.heatmap(
    summary.fillna(0).apply(lambda x: x / sum(x), axis=1),
    cmap="PuRd",
    square=True,
)

# Title typo fixed ("Proprtion" -> "Proportion").
plt.title("Proportion of genomes in each GC range per kingdom")

# median_summary.py

# Bin edges 12, 14, ..., 74; each bin is labelled with its upper edge.
my_bins = range(12, 75, 2)
my_labels = range(14, 75, 2)

# Median "Number of genes" per (Kingdom, GC% bin), with bins pivoted into
# columns.
genes_summary = (
    euk.groupby(
        ["Kingdom", pd.cut(euk["GC%"], bins=my_bins, labels=my_labels)]
    )["Number of genes"]
    .median()
    .unstack()
)
genes_summary

# median_heatmap.py

plt.figure(figsize=(15, 5))

sns.heatmap(genes_summary, cmap="YlOrBr", square=True)

# The rows of the table are kingdoms, so the title reads "per kingdom"
# (it previously said "per genome").
plt.title("Median number of genes for genomes in each GC range per kingdom")

# divergence_from_mean_heatmap.py

plt.figure(figsize=(15, 5))

# Divide every row (axis=1) by its own mean, so 1 means "equal to that row's
# mean"; the diverging palette is centred on 1 accordingly.
sns.heatmap(
    genes_summary.apply(lambda x: x / x.mean(), axis=1),
    cmap="PuOr",
    square=True,
    center=1,
)

# The rows of the table are kingdoms, so the title reads "per kingdom"
# (it previously said "per genome").
plt.title("Normalized number of genes for genomes in each GC range per kingdom")

# count_summary.py

# Bin edges 12, 14, ..., 74; each bin is labelled with its upper edge.
my_bins = range(12, 75, 2)
my_labels = range(14, 75, 2)

# Number of rows per (Kingdom, GC% bin), with bins pivoted into columns.
count_summary = (
    euk.groupby(
        ["Kingdom", pd.cut(euk["GC%"], bins=my_bins, labels=my_labels)]
    )
    .size()
    .unstack()
)
count_summary

# annotated_with_count.py

plt.figure(figsize=(17, 5))

# Colour encodes the row-normalized gene numbers (each row divided by its
# own mean, palette centred on 1); the text written in each cell comes from
# count_summary, printed without decimals.
sns.heatmap(
    genes_summary.apply(lambda x: x / x.mean(), axis=1),
    cmap="PuOr",
    square=True,
    center=1,
    annot=count_summary,
    fmt=".0f",
)

# The rows of the table are kingdoms, so the title reads "per kingdom"
# (it previously said "per genome").
plt.title("Normalized number of genes for genomes in each GC range per kingdom")

# classify_cells.py


def classify_cell(value):
    """Return "*" when value is below 10, otherwise an empty string.

    Anything for which `value < 10` is false (including NaN) yields "".
    """
    return "*" if value < 10 else ""


# Apply classify_cell to every cell of count_summary, giving a same-shaped
# table of "*" / "" strings.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map; switch once older pandas need not be supported.
annotation_table = count_summary.applymap(classify_cell)
annotation_table

# very_custom_annotation.py

plt.figure(figsize=(15, 5))

# Colour encodes the row-normalized gene numbers; the cell text is taken from
# annotation_table, and fmt="s" formats those annotations as strings.
sns.heatmap(
    genes_summary.apply(lambda x: x / x.mean(), axis=1),
    cmap="PuOr",
    square=True,
    center=1,
    annot=annotation_table,
    fmt="s",
)

# The rows of the table are kingdoms, so the title reads "per kingdom"
# (it previously said "per genome").
plt.title(
    "Normalized number of genes for genomes in each GC range per kingdom\n * indicates <10 genomes"
)

# edinburgh_summary.py

# first set the month order
from pandas.api.types import CategoricalDtype

months = ["January", "February", "March", "April", "May", "June", "July", "August",
    "September", "October", "November", "December"]

# Convert the Month column to an ordered categorical using the list above.
# NOTE: this modifies the shared `weather` DataFrame in place.
weather["Month"] = weather["Month"].astype(
    CategoricalDtype(categories=months, ordered=True)
)

# Mean of "Mean temperature" per (Year, Month) for the Edinburgh rows, with
# months pivoted into columns -> Year x Month table.
edinburgh_summary = (
    weather[weather["City"] == "Edinburgh"]
    .groupby(["Year", "Month"])["Mean temperature"]
    .mean()
    .unstack()
)
edinburgh_summary

# edinburgh_summary_heatmap.py

plt.figure(figsize=(10, 10))

sns.heatmap(edinburgh_summary, cmap="hot")

plt.title("Mean monthly temperature (°C) in Edinburgh since 1960")

# monthly_difference.py

# apply() with the default axis=0 passes one column (one month) at a time,
# so each value becomes its difference from that month's mean over all years.
edinburgh_difference = edinburgh_summary.apply(
    lambda month: month - month.mean()
)
edinburgh_difference

# classify_months.py


def classify_month(difference):
    """Map a temperature difference to an annotation symbol.

    Returns "+" when difference is greater than 1, "-" when it is less
    than -1, and "" in every other case (including NaN).
    """
    is_warmer = difference > 1
    is_cooler = difference < -1
    return "+" if is_warmer else ("-" if is_cooler else "")


# Apply classify_month to every cell, giving a same-shaped table of
# "+" / "-" / "" strings.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map.
edinburgh_annotation = edinburgh_difference.applymap(classify_month)
edinburgh_annotation

# custom_edinburgh_annotation.py

plt.figure(figsize=(10, 10))

# Colour shows the monthly mean temperature; the cell text comes from the
# annotation table, and fmt="s" formats those annotations as strings.
sns.heatmap(edinburgh_summary, cmap="hot", annot=edinburgh_annotation, fmt="s")

# Two-line title; the literal ends with a newline.
title = """Mean monthly temperature (°C) in Edinburgh since 1960
+/- indicates months that are at least 1° warmer/cooler than average
"""

plt.title(title)

# year_summary.py

# Mean of "Mean temperature" per (City, Year), with years pivoted into
# columns -> one row per city. dropna(axis=1) then drops every year column
# that contains a missing value.
# NOTE: this rebinds `city_summary` to a new table.
city_summary = (
    weather.groupby(["City", "Year"])
    ["Mean temperature"]
    .mean()
    .unstack()
    .dropna(axis=1)  # remove years with missing data
)
city_summary

from scipy.spatial.distance import pdist

# Condensed (1-D) vector of pairwise distances between the rows of
# city_summary, using pdist's default metric.
distances = pdist(city_summary)
distances

from scipy.spatial.distance import squareform

# Expand the condensed vector into a square symmetric matrix for display.
distance_matrix = squareform(distances)
distance_matrix

pd.DataFrame(
    distance_matrix, index=city_summary.index, columns=city_summary.index
)

import fastcluster

# linkage() expects either a condensed distance vector or a 2-D array of
# observations. Passing the square matrix would make it treat each row of
# distances as an observation and cluster on distances-between-distances,
# so pass the condensed vector instead.
linkage_matrix = fastcluster.linkage(distances)
linkage_matrix

from scipy.cluster.hierarchy import dendrogram

# Pass the labels as a plain list rather than a pandas Index, so the call does
# not depend on how dendrogram() handles Index objects.
dendrogram(linkage_matrix, labels=city_summary.index.tolist())

# simple_clustermap.py

# Clustered heatmap with column clustering switched off, so only the rows
# are clustered and the column order is kept.
g = sns.clustermap(
    city_summary, col_cluster=False, cmap="hot", figsize=(10, 4)
)

# The title is set on the whole figure; y=1.05 places it just above the top
# edge of the figure.
g.fig.suptitle("Mean yearly temperature for three cities", y=1.05)

# Same plot with seaborn's default of clustering both rows and columns.
g = sns.clustermap(city_summary, cmap="hot", figsize=(10, 4))

g.fig.suptitle("Mean yearly temperature for three cities", y=1.05)

# 4 to the power 6 = 4096.
# NOTE(review): presumably the number of possible DNA 6-mers -- confirm.
4 ** 6

# Read the table from a CSV file in the working directory.
genomes = pd.read_csv("fungi_genomes.csv")
genomes

# kmers_heatmap.py

plt.figure(figsize=(12, 8))
# Use "species" as the row index and drop the "Class" column before plotting.
sns.heatmap(genomes.set_index("species").drop(columns=["Class"]), cmap="BuPu")
plt.title("Proportion of kmers in fungal genomes")

# scaled_kmers_heatmap.py

plt.figure(figsize=(12, 8))
# vmax caps the colour scale at 0.002.
sns.heatmap(
    genomes.set_index("species").drop(columns=["Class"]),
    cmap="BuPu",
    vmax=0.002,
)
plt.title("Proportion of kmers in fungal genomes")

# clustered_kmers_heatmap.py

# Clustered version: column clustering is switched off, so only the rows
# (species) are clustered.
g = sns.clustermap(
    genomes.set_index("species").drop(columns=["Class"]),
    cmap="BuPu",
    vmax=0.002,
    col_cluster=False,
    figsize=(12, 8),
)
g.fig.suptitle("Proportion of kmers in fungal genomes", y=1.05)

# make_colors.py

# Series indexed by species in which the three listed class names are
# replaced by colour names; any other value is left unchanged by replace().
genomes.set_index("species")["Class"].replace(
    {"Ascomycetes": "red", "Basidiomycetes": "green", "Other Fungi": "blue"}
)

# colored_clustered_heatmap.py

# Row-clustered heatmap as before, with a colour strip next to the rows:
# row_colors is a species-indexed Series in which each class name has been
# replaced by a colour name.
g = sns.clustermap(
    genomes.set_index("species").drop(columns=["Class"]),
    cmap="BuPu",
    vmax=0.002,
    col_cluster=False,
    figsize=(12, 8),
    row_colors=genomes.set_index("species")["Class"].replace(
        {
            "Ascomycetes": "red",
            "Basidiomycetes": "green",
            "Other Fungi": "blue",
        }
    ),
)
# Built from two adjacent literals rather than an indented triple-quoted
# string, so the second title line does not start with the source
# indentation spaces.
g.fig.suptitle(
    "Proportion of kmers in fungal genomes, colored by class\n"
    "(red = Ascomycetes, green = Basidiomycetes, blue = Other Fungi)",
    y=1.05,
)

# monthly_mean_summary.py

# Mean of "Mean temperature" per (City, Month), with months pivoted into
# columns -> City x Month table.
weather.groupby(["City", "Month"])["Mean temperature"].mean().unstack()

# monthly_mean_summary_heatmap.py

# Heatmap of that table with square cells and a half-height colour bar.
sns.heatmap(
    weather.groupby(["City", "Month"])["Mean temperature"].mean().unstack(),
    square=True,
    cmap="BuPu",
    cbar_kws={"shrink": 0.5},
)
plt.title("Mean monthly temperature for three cities")

# long_monthly_mean.py

# as_index=False keeps City and Month as ordinary columns, so the result is
# a long-format table with one row per (City, Month).
monthly_mean = weather.groupby(["City", "Month"], as_index=False)[
    "Mean temperature"
].mean()
monthly_mean

# One point per (Month, City) pair; no size or colour encoding yet.
sns.relplot(
    data=monthly_mean, x="Month", y="City", aspect=4, height=2,
)
plt.xticks(rotation=45, horizontalalignment="right")

# Point size now encodes the mean temperature, with marker sizes scaled
# between 20 and 200.
g = sns.relplot(
    data=monthly_mean,
    x="Month",
    y="City",
    size="Mean temperature",
    sizes=(20, 200),
    aspect=4,
    height=2,
)
plt.xticks(rotation=45, horizontalalignment="right")
# Set the y-axis limits explicitly.
g.ax.set_ylim(-1, 4)
g.fig.suptitle(
    "Mean monthly temperature for three cities", y=1.1,
)

# Same plot with mean temperature additionally mapped to colour ("PuBu").
g = sns.relplot(
    data=monthly_mean,
    x="Month",
    y="City",
    size="Mean temperature",
    sizes=(20, 200),
    hue="Mean temperature",
    palette="PuBu",
    aspect=4,
    height=2,
)
plt.xticks(rotation=45, horizontalalignment="right")
g.ax.set_ylim(-1, 4)
g.fig.suptitle(
    "Mean monthly temperature for three cities", y=1.1,
)

# monthly_mean_bubble.py

# Same size- and colour-encoded plot, drawn with the axes grid turned off.
# rc_context applies the setting only inside the `with` block.
with plt.rc_context({"axes.grid": False}):
    g = sns.relplot(
        data=monthly_mean,
        x="Month",
        y="City",
        size="Mean temperature",
        sizes=(20, 200),
        hue="Mean temperature",
        palette="PuBu",
        aspect=4,
        height=2,
    )
    plt.xticks(rotation=45, horizontalalignment="right")
    g.ax.set_ylim(-1, 4)
    g.fig.suptitle(
        "Mean monthly temperature for three cities", y=1.1,
    )

# class_counts_bubble.py

# Grid off and extra vertical spacing between legend entries, again only
# inside the `with` block.
with plt.rc_context({"legend.labelspacing": 1.2, "axes.grid": False}):

    # Data: count of non-null "Species" values per (Class, Assembly status),
    # in long format (as_index=False keeps the keys as columns). Point size
    # encodes that count and colour encodes the assembly status.
    g = sns.relplot(
        data=euk.groupby(["Class", "Assembly status"], as_index=False)[
            "Species"
        ].count(),
        x="Class",
        y="Assembly status",
        size="Species",
        sizes=(20, 400),
        hue="Assembly status",
        palette="Dark2",
        aspect=4,
        height=3,
    )
    plt.xticks(rotation=45, horizontalalignment="right")
    g.fig.suptitle(
        "Number of genomes in different assembly status for each eukaryote class",
        y=1.1,
    )
    # Set the y-axis limits explicitly.
    g.ax.set_ylim(-1, 4)