%run -i "standard_header.py"
%matplotlib inline

import matplotlib as mpl

# Draw every figure produced below at 300 dpi.
mpl.rcParams["figure.dpi"] = 300

# Read the contig table from disk and preview its first rows.
contigs_path = "contigs.csv"
con = pd.read_csv(contigs_path)
con.head()

# Number of rows in the loaded table.
con.shape[0]

# all_contigs.py

# Scatter plot of the "GC" column against the "length" column for every
# row of `con`; the returned grid is kept in `g`.
g = sns.relplot(x="GC", y="length", data=con)

# pad=20 leaves extra room between the title and the top of the axes.
all_contigs_title = "GC content vs length for all contigs"
plt.title(all_contigs_title, pad=20)

# filtered_contigs.py

# Keep only the rows whose "length" is below 100,000 before plotting,
# so the bulk of the data is not squashed against the x axis.
is_under_100kb = con["length"] < 100_000
sns.relplot(data=con[is_under_100kb], x="GC", y="length")

# pad=20 leaves extra room for the two-line title.
plt.title("GC content vs length\nfor contigs < 100 Kb", pad=20)

# transparent_contigs.py

# Same filtered scatter plot, but with nearly transparent points
# (alpha=0.01) so that dense regions show up as darker areas.
short_contigs = con.loc[con["length"] < 100_000]
sns.relplot(x="GC", y="length", alpha=0.01, data=short_contigs)

# pad=20 leaves extra room for the two-line title.
plt.title("GC content vs length\nfor contigs < 100 Kb", pad=20)

# small_points_contigs.py

# Same filtered scatter plot, this time fighting overplotting with tiny
# markers (s=1) and no marker outline (linewidth=0).
tiny_markers = {"s": 1, "linewidth": 0}
sns.relplot(
    data=con[con["length"].lt(100_000)],
    x="GC",
    y="length",
    **tiny_markers,
)

# pad=20 leaves extra room for the two-line title.
plt.title("GC content vs length\nfor contigs < 100 Kb", pad=20)

# sampled_contigs.py

# Plot a random subset of 1000 of the contigs shorter than 100 Kb.
# A fixed random_state makes the sample (and therefore the figure)
# identical on every run instead of changing each time the cell executes.
# NOTE(review): sampling without replacement needs at least 1000 rows to
# pass the length filter — confirm that holds for contigs.csv.
sns.relplot(
    data=con[con["length"] < 100_000].sample(1000, random_state=42),
    x="GC",
    y="length",
)

plt.title(
    "GC content vs length for a random sample\n of 1000 contigs < 100 Kb",
    pad=20,  # allow a bit more space for a two line title
)

# Hex-binned joint plot of GC against length, limited to rows whose
# "length" is below 100,000.
under_100kb = con.loc[con["length"] < 100_000]
g = sns.jointplot(
    x="GC",
    y="length",
    data=under_100kb,
    gridsize=30,  # how many hexes wide and high
    kind="hex",
)

# Put a supertitle on the whole figure; y=1.05 nudges it upward.
hex_100kb_title = "GC content vs length for all contigs < 100 Kb"
g.fig.suptitle(hex_100kb_title, y=1.05)

# hexplot.py

# Hex-binned joint plot again, now limited to rows whose "length" is
# below 40,000.
is_under_40kb = con["length"] < 40_000
hex_options = {"kind": "hex", "gridsize": 30}  # 30 hexes wide and high
g = sns.jointplot(data=con[is_under_40kb], x="GC", y="length", **hex_options)

# Put a supertitle on the whole figure; y=1.05 nudges it upward.
g.fig.suptitle("GC content vs length for all contigs < 40 Kb", y=1.05)

# kdeplot.py

# Unfilled contour (KDE) joint plot of GC against length for contigs
# shorter than 40 Kb.
# `levels` / `fill` are the current seaborn spellings of the deprecated
# `n_levels` / `shade` keywords.
# NOTE(review): these keywords need seaborn >= 0.11 — confirm the
# installed version before applying.
g = sns.jointplot(
    data=con[con["length"] < 40_000],
    x="GC",
    y="length",
    kind="kde",
    levels=10,  # number of contour levels
    fill=False,  # contour lines only, no filled regions
    color="red",
)
# Put a supertitle on the whole figure; y=1.05 nudges it upward.
g.fig.suptitle(
    "Contour plot of GC content vs length\n for all contigs < 40 Kb", y=1.05
)

# shaded_kdeplot.py

# Filled contour (KDE) joint plot of GC against length for contigs
# shorter than 40 Kb, coloured with the "OrRd" colormap.
# `levels` / `fill` are the current seaborn spellings of the deprecated
# `n_levels` / `shade` keywords.
# NOTE(review): these keywords need seaborn >= 0.11 — confirm the
# installed version before applying.
g = sns.jointplot(
    data=con[con["length"] < 40_000],
    x="GC",
    y="length",
    kind="kde",
    levels=20,  # number of contour levels
    fill=True,  # fill the area between contour levels
    cmap="OrRd",
)
# Put a supertitle on the whole figure; y=1.05 nudges it upward.
g.fig.suptitle(
    "Contour plot of GC content vs length\n for all contigs < 40 Kb", y=1.05
)

# Filled contour (KDE) joint plot of GC against length for contigs
# shorter than 40 Kb, coloured with the "gist_earth" colormap to give
# the "island" look named in the title.
# `levels` / `fill` are the current seaborn spellings of the deprecated
# `n_levels` / `shade` keywords.
# NOTE(review): these keywords need seaborn >= 0.11 — confirm the
# installed version before applying.
g = sns.jointplot(
    data=con[con["length"] < 40_000],
    x="GC",
    y="length",
    kind="kde",
    levels=20,  # number of contour levels
    fill=True,  # fill the area between contour levels
    cmap="gist_earth",
)
# Put a supertitle on the whole figure; y=1.05 nudges it upward.
g.fig.suptitle(
    "Island plot of GC content vs length\n for all contigs < 40 Kb", y=1.05
)

# Build the table used by the regression and pair plots below: rows of
# `euk` with "Size (Mb)" under 4,000, minus any row with a missing value.
# NOTE(review): `euk` is not created in the visible part of this file —
# presumably it comes from standard_header.py or an earlier cell; verify.
euk_float = euk[euk["Size (Mb)"] < 4_000].dropna()

# Cast the count/year columns to float one at a time.
for column in ("Number of genes", "Number of proteins", "Publication year"):
    euk_float[column] = euk_float[column].astype(float)

# Print the column dtypes and non-null counts as a sanity check.
euk_float.info()

# linear_regression_plot.py

# Scatter plot with a fitted regression line: genome size on the x axis,
# number of genes on the y axis.
regression_axes = {"x": "Size (Mb)", "y": "Number of genes"}
sns.lmplot(data=euk_float, height=7, **regression_axes)

regression_title = "Regression of size vs number of genes\nfor genomes < 4Gb"
plt.title(regression_title)

# polynomial_regression_plot.py

# Same scatter plot as the linear version, but fitting a polynomial of
# degree 2 (order=2) and drawing no confidence band (ci=None).
sns.lmplot(
    data=euk_float,
    x="Size (Mb)",
    y="Number of genes",
    height=7,
    ci=None,
    order=2,
)

# The fit uses order=2, so the title must say "Second order"; the
# previous "First order" wording described a plain linear fit.
plt.title(
    "Second order polynomial regression of size vs number of genes\nfor genomes < 4Gb"
)

from scipy import stats


# Linear regression of gene count on genome size; the result object is
# left as the final expression so the notebook displays it.
genome_size = euk_float["Size (Mb)"]
gene_count = euk_float["Number of genes"]
stats.linregress(genome_size, gene_count)

# pair_plot.py

# Grid of pairwise plots over the columns of `euk_float`.
small_points = {"s": 5}  # draw small points
g = sns.pairplot(
    data=euk_float,
    height=1.5,  # make the individual plots small
    plot_kws=small_points,
)

# Put a supertitle on the whole figure; y=1.05 nudges it upward.
pair_title = "Pairwise relationships in the eukaryote genome dataset"
g.fig.suptitle(pair_title, y=1.05)

# styled_pair_plot.py

# Pairwise grid again, with a regression fit in each panel (kind="reg")
# and the scatter points drawn small and light grey.
grey_dots = {"color": "lightgrey", "s": 5}
g = sns.pairplot(
    data=euk_float,
    height=1.5,
    kind="reg",
    plot_kws={"scatter_kws": grey_dots},
)

# Put a supertitle on the whole figure; y=1.05 nudges it upward.
g.fig.suptitle("Pairwise regressions in the eukaryote genome dataset", y=1.05)

# polynomial_pair_plot.py

# Pairwise grid with a degree-2 polynomial fit (order=2) in each panel
# and the scatter points drawn small and light grey.
polynomial_fit_kws = {
    "scatter_kws": {"color": "lightgrey", "s": 5},
    "order": 2,
}
g = sns.pairplot(
    data=euk_float,
    height=1.5,
    kind="reg",
    plot_kws=polynomial_fit_kws,
)

# Put a supertitle on the whole figure; y=1.05 nudges it upward.
polynomial_pair_title = (
    "Pairwise polynomial regressions in the eukaryote genome dataset"
)
g.fig.suptitle(polynomial_pair_title, y=1.05)