# Imports
import os
import math
import numpy
import yaml
import scipy.stats
import matplotlib.colors
import altair as alt
import pandas as pd
import plotnine as p9
import dms_variants.codonvarianttable

# Create color palette
def color_gradient_hex(start, end, n):
    """Color function from polyclonal"""
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            name="_", colors=[start, end], N=n
        )
    return [matplotlib.colors.rgb2hex(tup) for tup in cmap(list(range(0, n)))]

# Orange to white to blue color gradient
orangeblue = color_gradient_hex("#E69F00", "white", n=20) + color_gradient_hex("white", "#0072B2", n=20)

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# this cell is tagged as `parameters` for papermill parameterization
libA_1  = None
libA_2  = None
libA_3  = None
libA_4  = None

libB_1  = None
libB_2  = None
libB_3  = None
libB_4  = None

summary_dir = None
score_dir = None

html_dir = None
html_output = None

# Parameters
libA_1 = "LibA-220823-293T-1"
libA_2 = "LibA-220823-293T-2"
libA_3 = "LibA-220907-293T-1"
libA_4 = "LibA-220907-293T-2"
libB_1 = "LibB-220823-293T-1"
libB_2 = "LibB-220823-293T-2"
libB_3 = "LibB-220907-293T-1"
libB_4 = "LibB-220907-293T-2"
summary_dir = "results/func_scores/"
score_dir = "results/func_scores/"
html_dir = "results/averaged_func_scores_ridgeplot/"
html_output = (
    "results/averaged_func_scores_ridgeplot/averaged_func_scores_ridgeplot.html"
)

# # Uncomment for running interactive
# libA_1 = "LibA-220823-293T-1"
# libA_2 = "LibA-220823-293T-2"
# libA_3 = "LibA-220907-293T-1"
# libA_4 = "LibA-220907-293T-2"

# libB_1 = "LibB-220823-293T-1"
# libB_2 = "LibB-220823-293T-2"
# libB_3 = "LibB-220907-293T-1"
# libB_4 = "LibB-220907-293T-2"

# summary_dir = "../results/func_scores/"
# score_dir = "../results/func_scores/"

# html_dir = "../results/averaged_func_scores_ridgeplot/"
# html_output = "../results/averaged_func_scores_ridgeplot/averaged_func_scores_ridgeplot.html"

# Selection names
selections = [
    libA_1,
    libA_2,
    libA_3,
    libA_4,
    libB_1,
    libB_2,
    libB_3,
    libB_4,
]

# Read and concat all count summary files
count_summaries = pd.concat(
    [pd.read_csv(summary_dir + s + "_count_summary.csv") for s in selections],
    ignore_index=True,
)

# Read and concat all func scores files
func_scores = pd.concat(
    [
        pd.read_csv(score_dir + s + "_func_scores.csv").assign(selection=s)
        for s in selections
    ],
    ignore_index=True,
)

# Create selection filters for plots
selectors = [
    alt.selection_point(
        fields=[sel],
        bind=alt.binding_select(
            options=[None] + sorted(count_summaries[sel].unique()),
            labels=["all"] + sorted(count_summaries[sel].unique()),
            name=sel,
        ),
    )
    for sel in ["library", "pre_selection_date", "post_selection_date"]
]

reduced_selectors = [
    alt.selection_point(
        fields=[sel],
        bind=alt.binding_select(
            options=[None] + sorted(count_summaries[sel].unique()),
            labels=["all"] + sorted(count_summaries[sel].unique()),
            name=sel,
        ),
    )
    for sel in ["library"]
]

# classify variants
func_scores = func_scores.pipe(
    dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
)


def ridgeplot(df):
    variant_classes = list(
        reversed(
            [
                c
                for c in [
                    "wildtype",
                    "synonymous",
                    "1 nonsynonymous",
                    ">1 nonsynonymous",
                    "stop",
                ]
                if c in set(df["variant_class"])
            ]
        )
    )

    # remove deletion variants
    df = df.loc[df["variant_class"] != "deletion"]

    assert set(df["variant_class"]) == set(variant_classes)

    # get smoothed distribution of functional scores
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )
    smoothed_dist = pd.concat(
        [
            pd.DataFrame(
                {
                    "selection": sel,
                    "variant_class": var,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(df["func_score"])(bins),
                    "mean_func_score": df["func_score"].mean(),
                    "number of variants": len(df),
                }
            )
            for (sel, var), df in df.groupby(["selection", "variant_class"])
        ]
    ).merge(
        count_summaries[
            ["selection", "library", "pre_selection_date", "post_selection_date"]
        ],
        on="selection",
        validate="many_to_one",
    )

    # assign y / y2 for plotting
    facet_overlap = 0.7  # maximal facet overlap
    max_count = (smoothed_dist["count"]).max()
    smoothed_dist = smoothed_dist.assign(
        y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )

    # ridgeline plot, based on this but using y / y2 rather than row:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    ridgeline_chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", title="variant entry score", scale=alt.Scale(nice=False)
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title="mean variant entry score",
                legend=alt.Legend(direction="horizontal"),
                scale=alt.Scale(
                    domainMid=0,
                    range=orangeblue, 
                    type="linear",
                    domain=[-6,1],
                ),
            ),
            facet=alt.Facet(
                "selection",
                columns=4,
                title=None,
                header=alt.Header(
                    labelFontWeight="bold",
                    labelPadding=0,
                ),
            ),
            tooltip=[
                "selection",
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean variant entry score"
                ),
                "number of variants",
                "library",
                "pre_selection_date",
                "post_selection_date",
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="black",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(width=180, height=22 * len(variant_classes))
    )

    for sel in selectors:
        ridgeline_chart = ridgeline_chart.add_params(sel).transform_filter(sel)

    ridgeline_chart = ridgeline_chart.properties(
        autosize=alt.AutoSizeParams(resize=True),
    )

    return ridgeline_chart


ridgeplot(func_scores)

def library_average_ridgeplot(df):
    variant_classes = list(
        reversed(
            [
                c
                for c in [
                    "wildtype",
                    "synonymous",
                    "1 nonsynonymous",
                    ">1 nonsynonymous",
                    "stop",
                ]
                if c in set(df["variant_class"])
            ]
        )
    )

    # remove deletions
    df = df.loc[df["variant_class"] != "deletion"]

    assert set(df["variant_class"]) == set(variant_classes)

    # get smoothed distribution of functional scores
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )
    smoothed_dist = pd.concat(
        [
            pd.DataFrame(
                {
                    "library": lib,
                    "variant_class": var,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(df["func_score"])(bins),
                    "mean_func_score": df["func_score"].mean(),
                    "number of variants": len(df),
                    "number of selections": 4, # four selections per library
                }
            )
            for (lib, var), df in df.groupby(["library", "variant_class"])
        ]
    )

    # assign y / y2 for plotting
    facet_overlap = 0.35  # maximal facet overlap
    max_count = (smoothed_dist["count"]).max()
    smoothed_dist = smoothed_dist.assign(
        y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )

    # ridgeline plot, based on this but using y / y2 rather than row:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    ridgeline_chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", 
                title="variant entry score", 
                scale=alt.Scale(nice=False),
                axis=alt.Axis(
                    labelFontSize=12,
                    titleFontSize=12,
                    values=[-4,-2,0,2],
                    labelFontWeight="normal",
                    titleFontWeight="normal",
                )
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                    labelFontSize=12,
                    labelFontWeight="normal",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title=["mean", "variant", "entry score"],
                legend=alt.Legend(
                    direction="vertical",
                    titleFontSize=12,
                    labelFontSize=12,
                    gradientLength=150,
                    # gradientThickness=10,
                    gradientStrokeColor="black",
                    gradientStrokeWidth=0.5,
                    labelFontWeight="normal",
                    titleFontWeight="normal",
                ),
                scale=alt.Scale(
                    domainMid=0,
                    range=orangeblue, 
                    type="linear",
                    domain=[-4.2, 1]
                ),
            ),
            facet=alt.Facet(
                "library",
                columns=4,
                title=None,
                header=alt.Header(
                    labelFontWeight="bold",
                    labelPadding=0,
                    labelFontSize=16,
                ),
            ),
            tooltip=[
                "library",
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean variant entry score"
                ),
                "number of selections",
                "number of variants",
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="black",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(
            width=240, 
            height=80 * len(variant_classes), 
            title="Variant entry score distributions by variant type",
        )
        .configure_title(fontSize=24)
    )

    for sel in reduced_selectors:
        ridgeline_chart = ridgeline_chart.add_params(sel).transform_filter(sel)

    ridgeline_chart = ridgeline_chart.properties(
        autosize=alt.AutoSizeParams(resize=True),
    )

    # Make output dir if doesn't exist
    if not os.path.exists(html_dir):
        os.mkdir(html_dir)

    print(f"Saving to {html_output}")
    ridgeline_chart.save(html_output)

    return ridgeline_chart

# Merge func_scores and count summaries for library info
merged_df = (
    func_scores.merge(
        count_summaries,
        on="selection",
        validate="many_to_one",
    )
)

# Clip functional scores at median of stop codons
lower_floor = merged_df.loc[merged_df["variant_class"] == "stop"]["func_score"].median()
print(f"functional scores are clipped on the lower end at {lower_floor} (median of stop codons) and on the upper end at 2")
merged_df["func_score"] = merged_df["func_score"].clip(lower=lower_floor, upper=2)

library_average_ridgeplot(merged_df)

functional scores are clipped on the lower end at -4.698 (median of stop codons) and on the upper end at 2

Saving to results/averaged_func_scores_ridgeplot/averaged_func_scores_ridgeplot.html

def library_average_ridgeplot(df):
    variant_classes = list(
        reversed(
            [
                c
                for c in [
                    "wildtype",
                    "synonymous",
                    "1 nonsynonymous",
                    ">1 nonsynonymous",
                    "stop",
                ]
                if c in set(df["variant_class"])
            ]
        )
    )

    # remove deletions
    df = df.loc[df["variant_class"] != "deletion"]

    assert set(df["variant_class"]) == set(variant_classes)

    # get smoothed distribution of functional scores
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )
    smoothed_dist = pd.concat(
        [
            pd.DataFrame(
                {
                    "library": lib,
                    "variant_class": var,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(df["func_score"])(bins),
                    "mean_func_score": df["func_score"].mean(),
                    "number of variants": len(df),
                    "number of selections": 4, # four selections per library
                }
            )
            for (lib, var), df in df.groupby(["library", "variant_class"])
        ]
    )

    # assign y / y2 for plotting
    facet_overlap = 0.35  # maximal facet overlap
    max_count = (smoothed_dist["count"]).max()
    smoothed_dist = smoothed_dist.assign(
        y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )

    # ridgeline plot, based on this but using y / y2 rather than row:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    ridgeline_chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", 
                title="variant entry score", 
                scale=alt.Scale(nice=False),
                axis=alt.Axis(
                    labelFontSize=8,
                    titleFontSize=8,
                    values=[-4,-2,0,2],
                    labelFontWeight="normal",
                    titleFontWeight="normal",
                )
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                    labelFontSize=8,
                    labelFontWeight="normal",
                    titleFontWeight="normal",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title=["mean", "variant", "entry score"],
                legend=alt.Legend(
                    direction="vertical",
                    titleFontSize=8,
                    labelFontSize=8,
                    gradientLength=60,
                    gradientThickness=10,
                    gradientStrokeColor="black",
                    gradientStrokeWidth=0.5,
                    labelFontWeight="normal",
                    titleFontWeight="normal",
                ),
                scale=alt.Scale(
                    domainMid=0,
                    range=orangeblue, 
                    type="linear",
                    domain=[-4.2, 1]
                ),
            ),
            facet=alt.Facet(
                "library",
                columns=4,
                title=None,
                header=alt.Header(
                    labelFontWeight="bold",
                    labelPadding=0,
                    labelFontSize=8,
                ),
                spacing=5,
            ),
            tooltip=[
                "library",
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean variant entry score"
                ),
                "number of selections",
                "number of variants",
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="black",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(width=60, height=20 * len(variant_classes))
    )

    for sel in reduced_selectors:
        ridgeline_chart = ridgeline_chart.add_params(sel).transform_filter(sel)

    ridgeline_chart = ridgeline_chart.properties(
        autosize=alt.AutoSizeParams(resize=True),
    )

    return ridgeline_chart

# Merge func_scores and count summaries for library info
merged_df = (
    func_scores.merge(
        count_summaries,
        on="selection",
        validate="many_to_one",
    )
)

# Clip functional scores at median of stop codons
lower_floor = merged_df.loc[merged_df["variant_class"] == "stop"]["func_score"].median()
print(f"functional scores are clipped on the lower end at {lower_floor} (median of stop codons) and on the upper end at 2")
merged_df["func_score"] = merged_df["func_score"].clip(lower=lower_floor, upper=2)

library_average_ridgeplot(merged_df)

functional scores are clipped on the lower end at -4.698 (median of stop codons) and on the upper end at 2

Visualize functional score distributions for selections used for final analysis¶

Distributions of functional scores¶