%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Use a high resolution for every figure drawn from here on.
mpl.rcParams["figure.dpi"] = 300

# Number of rows in `euk` for each (Kingdom, Assembly status) pair.
kingdom_status_groups = euk.groupby(["Kingdom", "Assembly status"])
kingdom_status_groups.size()

# long_summary.py

# Long-form summary: one row per (Kingdom, Assembly status) pair, with the
# group size stored in a "Number of genomes" column.
genome_counts = euk.groupby(["Kingdom", "Assembly status"]).size()
counts = genome_counts.to_frame("Number of genomes").reset_index()
counts

# long_summary_plot.py

# Grouped bar chart: kingdoms along the x axis, one bar colour per
# assembly status.
sns.catplot(
    kind="bar",
    data=counts,
    x="Kingdom",
    y="Number of genomes",
    hue="Assembly status",
)
plt.title("Number of genomes in different assembly status for each kingdom")

# long_summary_cols.py

# Same counts, but with one small panel (column) per assembly status and
# the kingdoms listed down the y axis.
g = sns.catplot(
    kind="strip",
    data=counts,
    x="Number of genomes",
    y="Kingdom",
    col="Assembly status",
    height=3,
)

# Label each panel with just the assembly status it shows.
g.set_titles("{col_name}")

# y=1.1 places the overall title above the top edge of the figure.
g.fig.suptitle(
    "Number of genomes in different assembly status for each kingdom",
    y=1.1,
)

# Group sizes as a Series indexed by (Kingdom, Assembly status).
status_counts = euk.groupby(["Kingdom", "Assembly status"]).size()
status_counts

# wide_summary.py

# With no argument, unstack() pivots the last index level into columns.
status_counts.unstack()

# Level 0 is "Kingdom": kingdoms become the columns.
status_counts.unstack(level=0)

# Level 1 is "Assembly status": assembly statuses become the columns.
status_counts.unstack(level=1)

# Raise the display limit so the whole long-form table is shown.
# The fully qualified option name is used on purpose: the bare "max_rows"
# pattern is ambiguous in pandas versions that also define
# "styler.render.max_rows", and set_option then raises OptionError.
pd.set_option("display.max_rows", 50)

(
    euk.groupby(["Kingdom", "Assembly status"])
    .size()
    .to_frame("Number of genomes")
    .reset_index()
)

# Go back to a short display limit for the rest of the session.
pd.set_option("display.max_rows", 10)

# Wide-form summary: one row per kingdom, one column per assembly status.
summary = (
    euk.groupby(["Kingdom", "Assembly status"])
    .size()
    .unstack()
)
summary

# In wide form each status is a column, so the contig-to-chromosome ratio
# per kingdom is a single column division.
contig_column = summary["Contig"]
chromosome_column = summary["Chromosome"]
contig_column / chromosome_column

# animals_plants.py

# Row masks: keep only animals/plants whose assembly is a contig or scaffold.
is_animal_or_plant = euk["Kingdom"].isin(["Animals", "Plants"])
is_contig_or_scaffold = euk["Assembly status"].isin(["Contig", "Scaffold"])

# Table of counts with kingdoms as rows and assembly statuses as columns.
animals_vs_plants = (
    euk[is_animal_or_plant & is_contig_or_scaffold]
    .groupby(["Kingdom", "Assembly status"])
    .size()
    .unstack()
)
animals_vs_plants

# chi_square.py

from scipy import stats

# Chi-square test of independence on the kingdom x assembly-status table.
stats.chi2_contingency(animals_vs_plants)

# make_subset.py

# Three identifying columns plus the two count columns to be reshaped.
subset_columns = [
    "Species",
    "Kingdom",
    "Class",
    "Number of genes",
    "Number of proteins",
]
subset = euk[subset_columns]
subset.head()

# melt.py

# Columns that identify a row; every other column of `subset` is stacked
# into ("Measurement type", "Count") pairs.
identifier_columns = ["Species", "Kingdom", "Class"]

long_subset = subset.melt(
    value_name="Count",
    var_name="Measurement type",
    id_vars=identifier_columns,
)
long_subset

# In the original wide layout the proteins-per-gene ratio is a plain
# column division.
proteins = euk["Number of proteins"]
genes = euk["Number of genes"]
proteins / genes

# plot_melt.py

# Rows with any missing value are dropped before plotting.
complete_rows = long_subset.dropna()

# One pair of boxes per kingdom: hue separates the two measurement types.
sns.catplot(
    kind="box",
    data=complete_rows,
    x="Kingdom",
    y="Count",
    hue="Measurement type",
)

plt.title(
    "Distributions of numbers of genes and proteins\nfor genomes in different kingdoms"
)

# Load the rainfall table from disk.
london_rain = pd.read_csv("london_rainfall.csv")
london_rain

# Row-wise difference between the June and January columns.
london_rain["June"] - london_rain["January"]

# Store the same difference as a new column of the table.
june_minus_january = london_rain["June"] - london_rain["January"]
london_rain["January -> June difference"] = june_minus_january

# plot_london_rain.py

# Recompute the difference column so this cell can run on its own.
june_minus_january = london_rain["June"] - london_rain["January"]
london_rain["January -> June difference"] = june_minus_january

# The named column is passed (not the bare difference) so the series name
# is available to seaborn for labelling.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the environment's seaborn version is confirmed.
sns.distplot(london_rain["January -> June difference"], color="lightblue")
plt.title(
    """Distribution of difference in total rainfall
between January and June in London"""
)

# Rows where June had less rainfall than January ...
june_below_january = london_rain["June"] < london_rain["January"]
london_rain[june_below_january]

# ... and how many rows fall on each side of that comparison.
june_below_january.value_counts()

# tidy_rain.py

# Reload the file so that only the columns present in the CSV are melted
# (the in-memory table has gained a derived difference column).
london_rain = pd.read_csv("london_rainfall.csv")

# Long form: one row per (Year, Month) with the rainfall value next to it.
tidy_rain = london_rain.melt(
    value_name="Rainfall (mm)",
    var_name="Month",
    id_vars=["Year"],
)
tidy_rain

# plot_tidy_rain.py

# Point plot of rainfall by month; aspect=3 makes the figure three times
# as wide as it is tall so all month labels fit along the x axis.
g = sns.catplot(
    kind="point",
    data=tidy_rain,
    x="Month",
    y="Rainfall (mm)",
    height=3.5,
    aspect=3,
)

g.fig.suptitle("Mean total monthly rainfall in London since 1960")

# Same data drawn as a line plot (relplot) instead of a point plot.
g = sns.relplot(
    kind="line",
    data=tidy_rain,
    x="Month",
    y="Rainfall (mm)",
    height=3.5,
    aspect=3,
)

g.fig.suptitle("Mean total monthly rainfall in London since 1960")

# set_month_category.py

from pandas.api.types import CategoricalDtype

# Month names in calendar order.
months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# An ordered categorical type gives the Month column this calendar ordering.
calendar_month_type = CategoricalDtype(categories=months, ordered=True)
tidy_rain["Month"] = tidy_rain["Month"].astype(calendar_month_type)

# plot_month_category.py

# Redraw the line plot now that Month is an ordered categorical column.
g = sns.relplot(
    kind="line",
    data=tidy_rain,
    x="Month",
    y="Rainfall (mm)",
    height=3.5,
    aspect=3,
)

g.fig.suptitle("Mean total monthly rainfall in London since 1960")

# How many rows there are for each species.
species = euk["Species"]
species.value_counts()

# Median GC% per (Kingdom, Assembly status) pair, as an indexed Series ...
gc_by_kingdom_status = euk.groupby(["Kingdom", "Assembly status"])["GC%"]
gc_by_kingdom_status.median()

# ... and as a flat table with the group labels moved back into columns.
gc_by_kingdom_status.median().reset_index()

# plot_median_gc.py

# Flat table of median GC% per (Kingdom, Assembly status) pair; the group
# labels are ordinary columns so they can be mapped to plot roles.
median_gc = (
    euk.groupby(["Kingdom", "Assembly status"])["GC%"]
    .median()
    .reset_index()
)

sns.catplot(
    kind="bar",
    data=median_gc,
    x="Kingdom",
    y="GC%",
    hue="Assembly status",
)
plt.title("Median GC percentage for each kingdom\ngrouped by assembly status")

# Copy of the table that uses the species name as its row index.
euk2 = euk.set_index(keys="Species")
euk2

# Label-based lookup through the index ...
euk2.loc["Arabidopsis thaliana"]

# ... compared with a boolean filter on the ordinary column.
is_arabidopsis = euk["Species"] == "Arabidopsis thaliana"
euk[is_arabidopsis]

# weather_index.py

weather = pd.read_csv("weather.csv")

# Series of mean temperatures indexed by four levels, in this order:
# Year, Month, Day of month, City.
temperatures = (
    weather
    .set_index(["Year", "Month", "Day of month", "City"])
    ["Mean temperature"]
)

temperatures

# A label for every level selects a single reading.
temperatures.loc[1972, "September", 7, "London"]

# Leaving off trailing levels selects everything below the given labels.
temperatures.loc[1972, "September", 7]

temperatures.loc[1972]

# A single bare label is looked up in the FIRST level (Year), so a city
# name is not found there and pandas raises KeyError. The error is caught
# and reported so the demonstration does not abort the rest of the script.
try:
    temperatures.loc["Berlin"]
except KeyError as error:
    print(f"KeyError: {error}")

# To select on the last level, every earlier level needs an explicit
# "take everything" slice.
temperatures.loc[:, :, :, "Berlin"]

s = "abcdefghijklm"

# A slice includes its start position but stops before its end position:
# this picks the characters at positions 4, 5, 6 and 7.
s[4:8]

# With both bounds left out, the slice covers the whole string.
s[:]

# Every year, but only 18 July in Berlin.
temperatures.loc[:, "July", 18, "Berlin"]

# Sort the index before slicing with explicit start/stop bounds.
temperatures = temperatures.sort_index()

# Range of years from 1978 to 1982.
temperatures.loc[1978:1982, "July", 18, "Berlin"]

# Same range with a step of two.
temperatures.loc[1978:1982:2, "July", 18, "Berlin"]

# Slice over month labels.
# NOTE(review): the month labels look like plain strings at this point, so
# this range presumably follows alphabetical rather than calendar order.
temperatures.loc[1996, "May":"September", 1, "Edinburgh"]

# set_month_order.py

from pandas.api.types import CategoricalDtype

# Month names in calendar order.
months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Make Month an ordered categorical column so that sorting and label
# slices follow the order of the list above.
calendar_month_type = CategoricalDtype(categories=months, ordered=True)
weather["Month"] = weather["Month"].astype(calendar_month_type)

# Rebuild the four-level temperature Series from the updated table and
# sort its index.
temperatures = (
    weather
    .set_index(["Year", "Month", "Day of month", "City"])
    ["Mean temperature"]
    .sort_index()
)

temperatures.loc[1996, "May":"September", 1, "Edinburgh"]

# The same selection written as boolean masks on the ordinary columns,
# one mask per condition.
in_1996 = weather["Year"] == 1996
may_to_september = weather["Month"].between("May", "September")
first_of_month = weather["Day of month"] == 1
in_edinburgh = weather["City"] == "Edinburgh"

weather[in_1996 & may_to_september & first_of_month & in_edinburgh]