%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Render figures at 300 dots per inch instead of matplotlib's default.
mpl.rc("figure", dpi=300)

# you'll see this alias in documentation, examples, etc.
import pandas as pd

# A Series built from a plain list; with no dtype given, pandas chooses one
# based on the values it is handed.
pd.Series(data=[1, 1, 2, 3, 5, 8, 13])  # whole numbers

pd.Series(data=[3.1415, 2.7182, 1.4142])  # decimal numbers

pd.Series(data=["apple", "banana", "orange"])  # text, dtype left to pandas

# Same text values, but explicitly requesting pandas' dedicated string dtype.
pd.Series(data=["apple", "banana", "orange"], dtype="string")

# simple_dataframe.py

# Two equal-length Series become the columns of a DataFrame; the dict keys
# supply the column names.
s1 = pd.Series(["a", "b", "c", "d"], dtype="string")
s2 = pd.Series([2, 4, 8, 16])
pd.DataFrame({"letter": s1, "number": s2})

# Build the same frame again, keeping a reference so its column types can be
# inspected.
df = pd.DataFrame({"letter": s1, "number": s2})
df.dtypes

# Use the fully qualified option name. The bare "max_columns" shorthand is
# treated as a pattern, and pandas raises OptionError when a pattern matches
# more than one option (as it does on pandas >= 1.4, which added
# "styler.render.max_columns").
pd.set_option("display.max_columns", 10)

# read_file.py

# Load the tab-separated table, letting pandas infer every column's type.
euk = pd.read_csv("eukaryotes.tsv", sep="\t")
euk

# Limit the display to five columns. The fully qualified option name is used
# throughout: the bare "max_columns" shorthand is a pattern that matches more
# than one option on pandas >= 1.4 and raises OptionError.
pd.set_option("display.max_columns", 5)

euk

# print() emits the plain-text rendering of the frame.
print(euk)

euk.dtypes

# Explicitly request the string dtype for the text columns instead of relying
# on inference.
my_types = {
    "Species": "string",
    "Kingdom": "string",
    "Class": "string",
    "Assembly status": "string",
}

euk = pd.read_csv("eukaryotes.tsv", sep="\t", dtype=my_types)

euk.dtypes

pd.set_option("display.max_columns", 10)

euk.tail()

# Re-read the file, this time treating a lone "-" as a missing value.
euk = pd.read_csv("eukaryotes.tsv", sep="\t", dtype=my_types, na_values=["-"])

euk.tail()

pd.set_option("display.max_columns", 5)

euk.dtypes

# read_file_with_types.py

# Column name -> dtype. "Int64" (capital I) is pandas' nullable integer type,
# so the two count columns can hold missing values and still be integers.
my_types = {
    "Species": "string",
    "Kingdom": "string",
    "Class": "string",
    "Assembly status": "string",
    "Number of genes": "Int64",
    "Number of proteins": "Int64",
}

# "-" is read as a missing value rather than as text.
euk = pd.read_csv("eukaryotes.tsv", sep="\t", dtype=my_types, na_values=["-"])

euk.dtypes

# Per-column summary of the frame.
euk.info()

# Descriptive statistics for the frame's columns.
euk.describe()

# show the first three rows
euk.head(3)

type(euk)

# Number of rows.
len(euk)

# Use the fully qualified option name. The bare "max_columns" shorthand is a
# pattern that matches more than one option on pandas >= 1.4 and raises
# OptionError.
pd.set_option("display.max_columns", 10)

# read_file_with_names.py

# Column name (as written in the file's header line) -> dtype.
my_types = {
    "Species": "string",
    "Kingdom": "string",
    "Class": "string",
    "Assembly status": "string",
    "Number of genes": "Int64",
    "Number of proteins": "Int64",
}

# Shorter replacement column names, in the same order as the file's columns.
my_names = [
    "species",
    "kingdom",
    "class",
    "size",
    "gc",
    "genes",
    "proteins",
    "year",
    "status",
]

# First approach: hand the new names straight to read_csv.
# NOTE(review): with names= and no header=0, pandas treats the file's first
# line as data, and the keys in my_types refer to the original header names
# rather than to my_names. Presumably this is a deliberate demonstration of
# the pitfall -- confirm before relying on the result.
pd.read_csv(
    "eukaryotes.tsv",
    sep="\t",
    dtype=my_types,
    na_values=["-"],
    names=my_names,
).head()

# Second approach: read with the file's own header (so the dtype keys match),
# then swap in the shorter names afterwards.
df = pd.read_csv(
    "eukaryotes.tsv",
    sep="\t",
    na_values=["-"],
    dtype={"Number of genes": "Int64", "Number of proteins": "Int64"},
)


df.columns = my_names
df.head()

# Use the fully qualified option name. The bare "max_columns" shorthand is a
# pattern that matches more than one option on pandas >= 1.4 and raises
# OptionError.
pd.set_option("display.max_columns", 5)