# Notebook setup (IPython magics, so this file is a notebook export rather
# than plain Python).
# NOTE(review): names used below without an import in this file (pd, sns,
# plt, and possibly euk / weather) are presumably provided by
# standard_header.py -- confirm.
%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# render figures at 300 dpi
mpl.rcParams["figure.dpi"] = 300

# TODO make sure that this all fits with new dtype stuff

# Attempt 1: skip the first 11 lines of the file and treat "-" as missing.
pd.read_csv("awkward_input.csv", skiprows=11, na_values=["-"])

# Attempt 2: same options, but supply our own column names via `names`.
pd.read_csv(
    "awkward_input.csv",
    na_values=["-"],
    skiprows=11,
    names=[
        "Species",
        "Genome size",
        "GC%",
        "Genes",
        "Year",
        "Status",
        "Is animal",
        "Is plant",
    ],
)

# Keep the column names in one place so every later read can reuse them.
my_columns = [
    "Species",
    "Genome size",
    "GC%",
    "Genes",
    "Year",
    "Status",
    "Is animal",
    "Is plant",
]

# Attempt 3: additionally restrict the parsed columns to the named ones.
pd.read_csv(
    "awkward_input.csv",
    names=my_columns,
    usecols=my_columns,
    na_values=["-"],
    skiprows=11,
)

# Drop the last line of the file as well as the first 11.
# `skipfooter` is not supported by the default C parser: pandas would emit a
# ParserWarning and silently fall back to the Python parser, so we select
# engine="python" explicitly to get the same result without the warning.
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    names=my_columns,
    na_values=["-"],
    usecols=my_columns,
    skipfooter=1,
    engine="python",
)
df

# check which dtype each column ended up with
df.info()

df["Genome size"]

# Re-read, telling pandas that "," is the decimal separator in this file.
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    engine="python",
)
df.info()

# look at the raw GC% values and the most frequent ones
df["GC%"].head()

df["GC%"].value_counts().head()

# Each read below adds one more parsing option and re-checks the dtypes.
# engine="python" is given explicitly because `skipfooter` is unsupported by
# the default C parser (pandas would otherwise warn and fall back).

# Treat the string "missing" as a missing value, in addition to "-".
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    engine="python",
)
df.info()

df["Genes"]

# Treat "_" as the thousands separator when parsing numbers.
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    engine="python",
)
df.info()

# rows where the genome size could not be parsed / is missing
df[df["Genome size"].isnull()]

# Ignore everything after a "#" on a line.
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    comment="#",
    engine="python",
)
df.info()

# Try asking for boolean columns directly via `dtype`, then inspect what
# that produced. engine="python" is explicit because `skipfooter` is
# unsupported by the default C parser (pandas would warn and fall back).
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    comment="#",
    dtype={"Is animal": bool, "Is plant": bool},
    engine="python",
)
df.info()

df["Is animal"].value_counts()

# Read again without the dtype so that the raw values can be examined.
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    comment="#",
    engine="python",
)
df["Is animal"].value_counts()

# show how Python's bool() interprets each distinct raw value
for value in df["Is animal"].unique():
    print(repr(value), bool(value))

# import_awkward_data_file.py

import numpy as np


# Final version of the import: list the strings that stand for True and
# False so the two flag columns can be parsed as real booleans.
# - dtype uses the builtin `bool`: the `np.bool` alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# - engine="python" is explicit because `skipfooter` is unsupported by the
#   default C parser (pandas would otherwise warn and fall back).
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    comment="#",
    true_values=["True", "1", "yes", '"t"'],
    false_values=["False", "0", "no", '"f"'],
    dtype={"Is animal": bool, "Is plant": bool},
    engine="python",
)
df.info()

# count True/False separately for each of the two flag columns
df[["Is animal", "Is plant"]].apply(lambda x: x.value_counts())

# Load one rain file per city.
london_rain = pd.read_csv("London_daily_rain.csv")
london_rain.head()

edinburgh_rain = pd.read_csv("Edinburgh_daily_rain.csv")
berlin_rain = pd.read_csv("Berlin_daily_rain.csv")

all_dataframes = [london_rain, berlin_rain, edinburgh_rain]

# compare the number of rows in each dataframe ...
for df in all_dataframes:
    print(df.shape[0])

# ... and the column labels
for df in all_dataframes:
    print(df.columns)

# Stack the three dataframes on top of each other.
big_df = pd.concat(all_dataframes)
big_df

big_df.shape[0]

# Look up a single date in the combined data.
is_target_day = (
    (big_df["Year"] == 1981)
    & (big_df["Month"] == "May")
    & (big_df["Day of month"] == 27)
)
big_df[is_target_day]

# Label each source dataframe with its city, then rebuild the combined one.
city_labels = ["London", "Berlin", "Edinburgh"]
for frame, city_label in zip(all_dataframes, city_labels):
    frame["City"] = city_label

big_df = pd.concat(all_dataframes)
big_df

# Same date lookup again, now on the dataframe that has the City column.
is_target_day = (
    (big_df["Year"] == 1981)
    & (big_df["Month"] == "May")
    & (big_df["Day of month"] == 27)
)
big_df[is_target_day]

# import_all_rain_files.py

import os

# this will hold our dataframes
all_dataframes = []

# os.listdir() returns names in arbitrary order, so sort them to make the
# row order of the concatenated dataframe reproducible between runs
for filename in sorted(os.listdir()):

    # only read files matching the pattern
    if not filename.endswith("_daily_rain.csv"):
        continue

    print(f"reading {filename}...")
    df = pd.read_csv(filename)

    # take the bit of the filename before the underscore
    df["City"] = filename.split("_")[0]

    # add each dataframe to our list
    all_dataframes.append(df)

# now concatenate all the dataframes in the list
big_df = pd.concat(all_dataframes)
big_df

# monthly_rain_plot.py

from pandas.api.types import CategoricalDtype

months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# an ordered categorical makes the months sort in the order listed above
# instead of alphabetically
month_dtype = CategoricalDtype(categories=months, ordered=True)
big_df["Month"] = big_df["Month"].astype(month_dtype)

# one line per city, rainfall against month
sns.relplot(
    data=big_df,
    kind="line",
    x="Month",
    y="Rainfall (mm)",
    hue="City",
    height=4,
    aspect=3,
)

plt.title("Mean daily rainfall in each month since 1960 for three cities")

weather

# set_temp_index.py

# index the weather data by city/year/day and keep only the temperature
indexed_weather = weather.set_index(["City", "Year", "Day of year"])
temperatures = indexed_weather["Mean temperature"]
temperatures

# set_rain_index.py

# give the rain data the same three-level index
big_df = big_df.set_index(["City", "Year", "Day of year"])
big_df

# do the two indexes hold the same labels in the same order?
temperatures.index.equals(big_df.index)

# add_temps.py

# assigning a Series aligns it with the dataframe on the index labels
big_df["Temperature (°C)"] = temperatures
big_df

# turn the index levels back into ordinary columns
big_df = big_df.reset_index()
big_df

# temp_and_rain.py

# one regression panel per city: temperature against rainfall, with the
# individual points drawn small and light grey
g = sns.lmplot(
    data=big_df,
    x="Rainfall (mm)",
    y="Temperature (°C)",
    col="City",
    height=4,
    scatter_kws={"s": 1, "color": "lightgrey"},
)
# y=1.1 places the overall title above the panels
g.fig.suptitle(
    "Daily rainfall vs daily temperature since 1960 for three cities",
    y=1.1,
)

# save the combined rain + temperature table, without the row index
big_df.to_csv("all_weather.csv", index=False)

euk.head()

# table mapping scientific names to common names
names = pd.read_csv("common_names.csv")
names

# simple_merge.py

# match euk's Species column against the scientific name column
merged = euk.merge(
    names,
    left_on="Species",
    right_on="scientific name",
)
merged.head()

merged.head()

# inspect the name rows for one particular species
is_arabidopsis = names["scientific name"] == "Arabidopsis thaliana"
names[is_arabidopsis]

# keep only the first row for each scientific name
unique_names = names.drop_duplicates(subset=["scientific name"])
unique_names

# unique_merge.py

# merge against the de-duplicated names instead
euk.merge(
    unique_names,
    left_on="Species",
    right_on="scientific name",
)

# list_names.py

# Build one row per scientific name whose "common name" is a comma-separated
# list of every common name recorded for it.
# The "common name" column is selected before aggregating: calling
# groupby(...).apply(...) on the whole frame also passes the grouping column
# to the function, which newer pandas versions deprecate.
list_common_names = (
    names.groupby("scientific name")["common name"]
    .agg(",".join)
    .to_frame("common name")
    .reset_index()
)
list_common_names

# list_merge.py

# merge the genome table with the one-row-per-species name table
euk.merge(list_common_names, left_on="Species", right_on="scientific name")

# Compare the different join types for the same pair of tables.

# inner: only rows whose key appears in both tables
euk.merge(
    list_common_names,
    left_on="Species",
    right_on="scientific name",
    how="inner",
)

# left: keep every row of euk
euk.merge(
    list_common_names,
    left_on="Species",
    right_on="scientific name",
    how="left",
)

# Use the fully qualified option name: the short pattern "max_columns"
# matches more than one option in newer pandas versions, which makes
# set_option raise OptionError.
pd.set_option("display.max_columns", 5)

# right: keep every row of list_common_names
euk.merge(
    list_common_names,
    left_on="Species",
    right_on="scientific name",
    how="right",
)

# outer: keep every row of both tables
euk.merge(
    list_common_names,
    left_on="Species",
    right_on="scientific name",
    how="outer",
)

# Left merge, then give species without a match a placeholder common name.
final_merge = euk.merge(
    list_common_names,
    left_on="Species",
    right_on="scientific name",
    how="left",
).fillna({"common name": "no common name"})
final_merge

pd.set_option("display.max_columns", 7)

# label of the form "Species (common name)"
display_names = (
    final_merge["Species"] + " (" + final_merge["common name"] + ")"
)
display_names

# wrap long labels at 20 characters
final_merge["display_name"] = display_names.str.wrap(20)
final_merge["display_name"]

# display_name_plot.py

# keep only the rows belonging to species that occur more than 30 times
my_data = final_merge.groupby("Species").filter(
    lambda group: len(group) > 30
)

# bar per species label, using the standard deviation for the error bars
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar` -- confirm the installed version before changing it.
sns.catplot(
    data=my_data,
    kind="bar",
    x="display_name",
    y="GC%",
    ci="sd",
    color="lightblue",
    height=5,
    aspect=3,
)

plt.title(
    "Mean and standard deviation of GC% for species with >30 sequenced genomes"
)

# load the combined weather table saved earlier
weather = pd.read_csv("all_weather.csv")
weather

weather.info()

# show_mem_usage.py

# per-column memory usage in bytes, then rescaled by 1000 and by 1_000_000
bytes_per_column = weather.memory_usage()
bytes_per_column

bytes_per_column / 1000

bytes_per_column / 1_000_000

# total over all columns
bytes_per_column.sum() / 1_000_000

# show_real_mem_usage.py

# deep=True also measures the contents of object columns
weather.memory_usage(deep=True) / 1_000_000

weather.info(memory_usage="deep")

# just_some_columns.py

# the only columns we want to load
wanted_columns = [
    "City",
    "Year",
    "Day of year",
    "Rainfall (mm)",
    "Temperature (°C)",
]

weather = pd.read_csv("all_weather.csv", usecols=wanted_columns)

usage = weather.memory_usage(deep=True)
print(usage)
print(usage.sum() / 1_000_000)

# categories.py

# same columns, but store City as a categorical
weather = pd.read_csv(
    "all_weather.csv",
    dtype={"City": "category"},
    usecols=wanted_columns,
)

usage = weather.memory_usage(deep=True)
print(usage)
print(usage.sum() / 1_000_000)

# memory used by the Species column as-is and as a categorical
species = euk["Species"]
species.memory_usage(deep=True) / 1000

species.astype("category").memory_usage(deep=True) / 1000

# number of non-missing values versus number of distinct values
species.count(), species.nunique()

# the same column when freshly read from the file
raw_euk = pd.read_csv("eukaryotes.tsv", sep="\t", na_values=["-"])
raw_euk.dtypes

raw_euk["Species"].memory_usage(deep=True) / 1000

weather.info()

# number of distinct values representable in 64 bits, and half of that
2 ** 64

2 ** 64 // 2

# half the number of distinct values representable in 16 bits
2 ** 16 / 2

# reduced_precision.py

# store the two integer columns in 16 bits and City as a categorical
weather = pd.read_csv(
    "all_weather.csv",
    dtype={"City": "category", "Year": "int16", "Day of year": "int16"},
    usecols=[
        "City",
        "Year",
        "Day of year",
        "Rainfall (mm)",
        "Temperature (°C)",
    ],
)

usage = weather.memory_usage(deep=True)
print(usage)
print(usage.sum() / 1_000_000)

# memory used by the Year column under each candidate dtype
weather = pd.read_csv("all_weather.csv")
for data_type in ["category", "int64", "int32", "int16", "int8"]:
    converted = weather["Year"].astype(data_type)
    usage = converted.memory_usage(deep=True) / 1000
    print(f"{data_type} {usage}")

# what the Year values look like when squeezed into 8 bits
weather["Year"].astype("int8").head()

# the same comparison for the Day of year column
weather = pd.read_csv("all_weather.csv")
for data_type in ["category", "int64", "int32", "int16", "int8"]:
    converted = weather["Day of year"].astype(data_type)
    usage = converted.memory_usage(deep=True) / 1000
    print(f"{data_type} {usage}")

# Re-read the awkward file with boolean flag columns and report the memory
# used by each column.
# - dtype uses the builtin `bool`: the `np.bool` alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# - engine="python" is explicit because `skipfooter` is unsupported by the
#   default C parser (pandas would otherwise warn and fall back).
df = pd.read_csv(
    "awkward_input.csv",
    skiprows=11,
    na_values=["-", "missing"],
    names=my_columns,
    usecols=my_columns,
    skipfooter=1,
    decimal=",",
    thousands="_",
    comment="#",
    true_values=["True", "1", "yes", '"t"'],
    false_values=["False", "0", "no", '"f"'],
    dtype={"Is animal": bool, "Is plant": bool},
    engine="python",
)
df.memory_usage(deep=True)

weather.dtypes

rainfall = weather["Rainfall (mm)"]
rainfall.head()

# memory used as-is, as nullable 64-bit integers, and as nullable 16-bit
print(rainfall.memory_usage(deep=True) / 1_000)
print(rainfall.astype("Int64").memory_usage(deep=True) / 1_000)

print(rainfall.astype("Int16").memory_usage(deep=True) / 1_000)

# does every non-missing rainfall value survive conversion to Int16?
known_rainfall = rainfall.dropna()
all(known_rainfall.astype("Int16") == known_rainfall)

temperature = weather["Temperature (°C)"]
temperature.dtype

temperature.head()

# does every non-missing temperature survive conversion to float32?
known_temperature = temperature.dropna()
all(known_temperature.astype("float32") == known_temperature)

# Largest absolute difference introduced by storing the temperatures with
# fewer bits.
# Series.max() is used instead of the builtin max(): the builtin compares
# elements one by one, so a NaN in the first position makes the whole result
# NaN, whereas Series.max() skips missing values.
temperature = weather["Temperature (°C)"]

(temperature - temperature.astype("float32")).abs().max()

(temperature - temperature.astype("float16")).abs().max()

# memory used by the column as-is and as float16
print(temperature.memory_usage(deep=True) / 1_000)
print(temperature.astype("float16").memory_usage(deep=True) / 1_000)

# final_memory_save.py

# one compact dtype for every column we load
compact_dtypes = {
    "City": "category",
    "Year": "int16",
    "Day of year": "int16",
    "Rainfall (mm)": "Int16",
    "Temperature (°C)": "float16",
}

weather = pd.read_csv(
    "all_weather.csv",
    dtype=compact_dtypes,
    usecols=[
        "City",
        "Year",
        "Day of year",
        "Rainfall (mm)",
        "Temperature (°C)",
    ],
)

weather.info(memory_usage="deep")

# load the whole file again and keep the rows with more than 100 mm of rain
weather = pd.read_csv("all_weather.csv")
rainy_days = weather.loc[weather["Rainfall (mm)"] > 100]

# compare the memory footprint of the subset with the full table
rainy_days.info(verbose=False, memory_usage="deep")

weather.info(verbose=False, memory_usage="deep")

# chunk_mem.py

# read the file 10000 rows at a time; report each chunk's size and memory
for chunk in pd.read_csv("all_weather.csv", chunksize=10000):
    chunk_megabytes = chunk.memory_usage(deep=True).sum() / 1_000_000
    print(len(chunk), chunk_megabytes)

# chunk_filter.py

# apply the rainy-day filter to one chunk at a time
for chunk in pd.read_csv("all_weather.csv", chunksize=10000):
    is_rainy = chunk["Rainfall (mm)"] > 100
    chunk_rainy_days = chunk[is_rainy]
    print(f"found {len(chunk_rainy_days)} rainy days")

# chunk_concat.py

# collect the filtered rows of every chunk ...
chunk_rainy_days_list = [
    chunk[chunk["Rainfall (mm)"] > 100]
    for chunk in pd.read_csv("all_weather.csv", chunksize=10000)
]

# ... and stitch them back together into one dataframe
all_chunk_rainy_days = pd.concat(chunk_rainy_days_list)
all_chunk_rainy_days.head()

# is the chunked result the same as filtering the whole file at once?
rainy_days.equals(all_chunk_rainy_days)

# chunk_months.py

# per chunk: how many rainy days fall in each month
for chunk in pd.read_csv("all_weather.csv", chunksize=10000):
    is_rainy = chunk["Rainfall (mm)"] > 100
    chunk_rainy_days = chunk[is_rainy]
    print(chunk_rainy_days["Month"].value_counts().to_dict())

# chunk_dict.py

# running total of rainy days per month across all chunks
month_counts = {}

for chunk in pd.read_csv("all_weather.csv", chunksize=10000):
    chunk_rainy_days = chunk[chunk["Rainfall (mm)"] > 100]
    chunk_counts = chunk_rainy_days["Month"].value_counts().to_dict()

    for month, count in chunk_counts.items():
        # a month we have not seen yet starts from zero; add this
        # chunk's count to whatever has been accumulated so far
        month_counts[month] = month_counts.get(month, 0) + count
month_counts