%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Set the default resolution of matplotlib figures to 300 dots per inch.
mpl.rcParams["figure.dpi"] = 300

# Assemble a four-row demonstration dataframe from two equal-length series:
# a string-typed series of letters and a series of integers.
s1 = pd.Series(data=list("abcd"), dtype="string")
s2 = pd.Series(data=[2, 4, 8, 16])
small_df = pd.DataFrame(data=dict(letter=s1, number=s2))
small_df

# A boolean mask holds one flag per row; only the two middle rows are True.
mask = [False, True, True, False]

# Indexing the dataframe with the mask keeps exactly the rows flagged True.
small_df[mask]

# NOTE(review): euk is not defined in this file; presumably it is a pandas
# dataframe created by standard_header.py — confirm.
# Comparing a column with a number produces one True/False value per row.
# This rebinds mask, replacing the plain list of booleans used earlier.
mask = euk["Size (Mb)"] > 500
mask

# Index with the mask to keep only the rows where it is True.
euk[mask]

# Number of rows that pass the filter.
len(euk[mask])

# simple_filter.py

# The mask can be written inline instead of being stored in a variable first.
euk[euk["Size (Mb)"] > 500]

# find genomes belonging to a single species
euk[euk["Species"] == "Penicillium expansum"]

# find all genomes that are not fish
euk[euk["Class"] != "Fishes"]

# Plain Python: the `in` operator tests whether one value is in a list.
"apple" in ["apple", "banana", "orange"]

# find bird and fish genomes
# NOTE: this attempt is deliberately not a working filter. `in` asks whether
# the column as a whole equals one element of the list, not whether each
# row's value does. With a pandas Series on the left the comparison yields a
# Series whose truth value is ambiguous, so this raises ValueError instead of
# returning a mask. Row-by-row membership is what Series.isin() provides.
# The class label is spelled "Fishes" to match the other filters in this file.
euk["Class"] in ["Birds", "Fishes"]

# list_filter.py

# find all bird and fish genomes
# isin() tests each row's value for membership in the list and returns a
# boolean mask. The label must match the data exactly: the other filters in
# this file use "Fishes", so "Fish" would presumably match no fish rows and
# silently return birds only.
euk[euk["Class"].isin(["Birds", "Fishes"])]

# Plain Python: a single string has a startswith() method.
"banana".startswith("b")

# find genomes for species starting with the letter Q
# NOTE: this attempt is deliberately not a working filter. A pandas Series
# has no startswith() method of its own, so with a Series on the left this
# raises AttributeError. String methods are reached through the .str accessor.
euk["Species"].startswith("Q")

# string_filter.py

# find genomes for species starting with the letter Q

# .str.startswith() applies the test to every value in the column, giving a
# boolean mask that is then used to filter the rows.
euk[euk["Species"].str.startswith("Q")]

# Plain Python: two comparisons on a single value are combined with `and`.
number = 42

# is my number between 10 and 100?
number > 10 and number < 100

# find genomes between a hundred and a thousand megabases
# For masks the conditions are combined with & instead of `and`. Each
# comparison needs its own parentheses because & binds more tightly than
# > and <.
(euk["Size (Mb)"] > 100) & (euk["Size (Mb)"] < 1000)

# complex_filter.py

# find genomes between a hundred and a thousand megabases
# Both bounds are strict here: rows of exactly 100 or 1000 are excluded.
euk[(euk["Size (Mb)"] > 100) & (euk["Size (Mb)"] < 1000)]

# find genomes between a hundred and a thousand megabases
# NOTE: between() includes both endpoints by default, so unlike the strict
# > / < version, rows of exactly 100 or 1000 are kept here.
euk[euk["Size (Mb)"].between(100, 1000)]

# find genomes that are not birds or fishes, or have low GC
# ~ negates a mask and | combines two masks with "or".
euk[~(euk["Class"].isin(["Birds", "Fishes"])) | (euk["GC%"] < 40)]

# Build a mask of genomes with more than ten thousand genes.
# NOTE(review): how rows with a missing gene count behave in this comparison
# depends on the column dtype (NaN in a float column compares as False) —
# confirm against the loaded data.
euk["Number of genes"] > 10_000

# Count how many rows of the mask are True and how many are False.
(euk["Number of genes"] > 10_000).value_counts()

# Same count, but dropna=False also reports any missing values in the mask.
(euk["Number of genes"] > 10_000).value_counts(dropna=False)

# Number of rows that pass the filter.
len(euk[euk["Number of genes"] > 10_000])

# Replace missing gene counts with 20000 before comparing; the filled rows
# then exceed the threshold and are counted as passing the filter.
len(euk[euk["Number of genes"].fillna(20000) > 10_000])

# what are the sizes of all the human genomes?
# Filter the rows first, then select a single column from the result.
euk[euk["Species"] == "Homo sapiens"]["Size (Mb)"]

# filter_select_aggregate.py

# Filter rows, select a column, then aggregate it: the mean of the
# "Size (Mb)" column over the rows whose class is "Birds".
euk[euk["Class"] == "Birds"]["Size (Mb)"].mean()

# The same chain wrapped in parentheses so each step can sit on its own line
# with its own comment.
(
    euk[euk["Class"] == "Birds"] # find bird genomes
    ["Size (Mb)"] # get size column
    .mean() # calculate mean
)

# Method chaining on a plain string: slice, upper-case, then count.
name = "Gorilla gorilla"
name[:10].upper().count("G")

# The same chain split over several lines inside parentheses, one step per
# line.
name = "Gorilla gorilla"
(
    name[:10]  # get the first ten characters of the name
    .upper()  # change it to upper case
    .count("G")  # count the number of Gs
)

# what's the median number of genes for genomes with low GC?
# "Low" here means a GC% value below 50.
euk[euk["GC%"] < 50]["Number of genes"].median()

# how many genomes were sequenced for each kingdom in 2010?
# value_counts() tallies the rows per distinct value of the Kingdom column.
euk[euk["Publication year"] == 2010]["Kingdom"].value_counts()

# what's the largest number of proteins for fish genomes from 2017?
# Two conditions combined with &, then select a column and take its maximum.
(
    euk[
        (euk["Class"] == "Fishes") & (euk["Publication year"] == 2017)
    ]
    ["Number of proteins"]
    .max()
)

# Store the filtered rows in a variable so the result can be reused without
# repeating the filter expression.
animals = euk[euk["Kingdom"] == "Animals"]

# do something with the animals dataframe