# Notebook setup: execute the shared header script inside this interactive
# namespace (-i), so the names it defines become available here.
# NOTE(review): the `euk` DataFrame used throughout is presumably created by
# standard_header.py — confirm.
%run -i "standard_header.py"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import matplotlib as mpl

# Set the default figure resolution to 300 dots per inch.
mpl.rcParams["figure.dpi"] = 300

# Rows whose "Kingdom" value is "Fungi".
euk.loc[euk["Kingdom"].eq("Fungi")]

# Narrow further to rows whose "Size (Mb)" value is above 100.
euk.loc[euk["Kingdom"].eq("Fungi") & euk["Size (Mb)"].gt(100)]

# Same row selection, keeping only the "Species" column.
euk.loc[euk["Kingdom"].eq("Fungi") & euk["Size (Mb)"].gt(100), "Species"]

# fungal_species_names.py

# Row mask and column label in a single .loc lookup, converted to a plain
# Python list.
species = euk.loc[
    euk["Kingdom"].eq("Fungi") & euk["Size (Mb)"].gt(100),
    "Species",
].tolist()

# Preview: the first ten names only.
species[:10]

# How many rows each kingdom contributes.
euk["Kingdom"].value_counts()

# All plant rows ...
euk.loc[euk["Kingdom"].eq("Plants")]

# ... and the number of distinct species names among them.
euk.loc[euk["Kingdom"].eq("Plants"), "Species"].nunique()

# Same count for a hand-written list of kingdoms.
for kingdom in ("Plants", "Animals", "Fungi", "Protists", "Other"):
    print(
        kingdom,
        euk.loc[euk["Kingdom"].eq(kingdom), "Species"].nunique(),
    )

# The kingdoms actually present in the data.
euk["Kingdom"].unique()

# kingdom_counts.py

# Drive the loop from the data instead of a hard-coded list.
for kingdom in euk["Kingdom"].unique():
    print(
        kingdom,
        euk.loc[euk["Kingdom"].eq(kingdom), "Species"].nunique(),
    )

# Goal: select every record belonging to the genus Aquila.
# The "Genus" column is only created near the end of this section, so the
# column-based filter is placed after that assignment; referencing
# euk["Genus"] earlier raises KeyError on a fresh top-to-bottom run.
# NOTE(review): this assumes standard_header.py does not already add a
# "Genus" column — confirm.

# Split each species name into its space-separated words ...
euk["Species"].str.split(" ")

# ... keep the first word, which is used as the genus below ...
euk["Species"].str.split(" ").str[0]

# ... and compare it with the genus of interest to get a boolean mask.
euk["Species"].str.split(" ").str[0] == "Aquila"

# genus_filter.py

euk[euk["Species"].str.split(" ").str[0] == "Aquila"]

# A bare prefix test also matches any longer first word that merely begins
# with "Aquila".
euk[euk["Species"].str.startswith("Aquila")]

# Including the trailing space requires the first word to end right after
# "Aquila".
euk[euk["Species"].str.startswith("Aquila ")]

# Store the first word of each species name as a reusable "Genus" column.
euk["Genus"] = euk["Species"].str.split(" ").str[0]
euk[["Species", "Genus", "Class", "Kingdom"]]

# With the column in place, the genus can be filtered on directly.
euk[euk["Genus"] == "Aquila"]

# Most frequent "Assembly status" over the whole table, with its count
# (value_counts() orders by descending frequency).
euk["Assembly status"].value_counts().iloc[:1]

# Same question restricted to rows whose "Class" is "Insects".
euk.loc[euk["Class"].eq("Insects"), "Assembly status"].value_counts().iloc[:1]


# And for rows whose "Class" is "Amphibians".
(
    euk.loc[euk["Class"].eq("Amphibians"), "Assembly status"]
    .value_counts()
    .iloc[:1]
)

# class_assembly_summary.py

# Print, for every class, the assembly status that occurs most often.
# Missing class labels are dropped first: comparing against NaN selects no
# rows, and taking index[0] of the resulting empty counts raises IndexError.
for class_name in euk["Class"].dropna().unique():
    # value_counts() sorts by descending frequency, so the first index label
    # is the most common status for this class.
    top_status = (
        euk[euk["Class"] == class_name]["Assembly status"]
        .value_counts()
        .index[0]
    )
    print(f"Most {class_name} genomes are assembled to {top_status} status")

# Element-wise ratio of "Number of proteins" to "Number of genes".
euk["Number of proteins"].div(euk["Number of genes"])

# Boolean mask: True where that ratio is at least 1.1.
euk["Number of proteins"].div(euk["Number of genes"]).ge(1.1)

# more_proteins.py

# Rows selected by the mask.
euk.loc[euk["Number of proteins"].div(euk["Number of genes"]).ge(1.1)]

# Keep the ratio as its own column so it can be filtered on and displayed.
euk["Proteins per gene"] = euk["Number of proteins"].div(
    euk["Number of genes"]
)

# Same row filter via the stored column, showing only the relevant columns.
euk.loc[
    euk["Proteins per gene"].ge(1.1),
    [
        "Species",
        "Class",
        "Number of proteins",
        "Number of genes",
        "Proteins per gene",
    ],
]