%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# set matplotlib's default figure resolution to 300 dots per inch
mpl.rcParams["figure.dpi"] = 300

# select a single column by name using square brackets
# NOTE(review): euk is not defined in this file - presumably it is a
# DataFrame created by standard_header.py; confirm there
euk["Size (Mb)"]

# attribute-style access to the Kingdom column; this form needs a column name
# that is a valid Python identifier, so it cannot be used for "Size (Mb)"
euk.Kingdom

# store the column in a variable so that it can be reused
sizes = euk["Size (Mb)"]

# iterate_over_series.py

# a series can be looped over like any other Python sequence
sizes = euk["Size (Mb)"]

# loop over just the first ten entries of the series
for s in sizes[:10]:
    print(f"one size is {s!s} Megabases")

# built-in functions that accept an iterable work too: find the largest size
max(sizes)

# descriptive_statisics_series.py

# summary statistics are available as methods on the series itself;
# each call below returns a single number, which is converted to text
# with str() so that it can be joined onto the label
sizes = euk["Size (Mb)"]

print("The minimum size is " + str(sizes.min()))
print("The maximum size is " + str(sizes.max()))
print("The mean size is " + str(sizes.mean()))
print("The median size is " + str(sizes.median()))
print("The standard deviation of the sizes is " + str(sizes.std()))
print("The skew of the size is " + str(sizes.skew()))
# quantile() takes a fraction between 0 and 1, so 0.9 asks for the 90th percentile
print("The 90th percentile size is " + str(sizes.quantile(0.9)))

# the ten largest values in the series
sizes.nlargest(10)

# five values picked at random; no random_state is given, so the selection
# is not reproducible between runs
sizes.sample(5)

# the first few values (head() returns five rows unless told otherwise)
sizes.head()

# count how many times each distinct species name occurs
euk["Species"].value_counts()

# arithmetic with a single number is applied to every element of the series:
# multiplying by one million converts megabases to bases
sizes * 1_000_000

# calculate AT content
# GC% is divided by 100 to turn it into a fraction - presumably the column
# holds a percentage between 0 and 100; confirm against the data
1 - (euk["GC%"] / 100)

# broadcasting_series.py

# calculate number of genes per kilobase
# (size in Mb * 1000 gives the size in kilobases; dividing one series by
# another works element by element)
euk["Number of genes"] / (euk["Size (Mb)"] * 1000)

import math

# NOTE: math.log works on a single number, so handing it a whole series is
# expected to raise a TypeError; the numpy version below is the one that works
math.log(sizes)

import numpy as np

# numpy's log is applied to each element of a list ...
np.log([1, 2, 3, 4])

# ... and to each element of a series
np.log(sizes)

# string_series.py

# we can concatenate strings
# (+ joins the two columns element by element; the plain strings " (" and ")"
# are added to every element)
euk["Species"] + " (" + euk["Class"] + ")"

# get species name in upper case
# NOTE: upper() is a method of individual strings, not of a series, so this
# call is expected to fail with an AttributeError; the working version is in
# the string_methods.py section that follows
euk["Species"].upper()

# string_methods.py

# get species name in upper case
# (the .str accessor applies the string method to every element)
euk["Species"].str.upper()

# the result of np.log is itself a series, so series methods can be chained
# straight onto it
np.log(euk["Size (Mb)"]).mean()

# get the median gene density
# (the outer parentheses make median() apply to the result of the division)
(euk["Number of genes"] / (euk["Size (Mb)"] * 1000)).median()

# get the median gene density
# (same calculation, with the intermediate series stored in a variable first)
densities = euk["Number of genes"] / (euk["Size (Mb)"] * 1000)
densities.median()

# a list of column names selects several columns at once
euk[["Species", "Size (Mb)", "Number of genes"]].head()

# the three largest log-transformed sizes
np.log(euk["Size (Mb)"]).nlargest(3)

# the GC% column on its own
euk["GC%"]

# use the Species column as the row labels; the result is not assigned,
# so euk itself is left unchanged
euk.set_index("Species").head()

# series_index.py

# show 5 species with the most genes
# (5 is the default for nlargest; it is spelled out here to match the comment)
euk.set_index("Species")["Number of genes"].nlargest(n=5)

# order the series by its values, smallest gene count first
euk.set_index("Species")["Number of genes"].sort_values(ascending=True)

# order the series by its index, i.e. by species name
euk.set_index("Species")["Number of genes"].sort_index(ascending=True)

# remove_missing_data.py

# same ordering by species name, then discard entries with no gene count
euk.set_index("Species")["Number of genes"].sort_index(ascending=True).dropna()

# multiline_method_chains.py

# wrapping the whole chain in parentheses lets it span several lines,
# with one step and one comment per line
(
    euk  # start with the dataframe
    .set_index("Species")  # set the index to be the species name
    ["Number of genes"]  # get the number of genes
    .sort_index()  # sort the series by the index
    .dropna()  # remove any missing data
)

# nothing in this file assigns a new value to euk, so this shows the
# dataframe as it was originally loaded
euk.head()

# the same idea with a plain Python string: methods return a new value
s = "Python"
s.upper()  # this doesn't change the original variable
s

# is the size greater than 500 megabases?
# (the comparison is made element by element and gives a series of
# True/False values)
euk["Size (Mb)"] > 500