Plot average functional scores per variant¶

Plot the functional scores from the functional selections.

First, import Python modules:

In [1]:
import altair as alt

import dms_variants.codonvarianttable
import dms_variants.utils

import numpy

import scipy.stats

import pandas as pd

import yaml
In [2]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

Read configuration:

In [3]:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Read in all the data:

In [4]:
with open(config["func_effects_config"]) as f:
    selections = list(yaml.safe_load(f)["avg_func_effects"]["TZM-bl_entry"]["selections"])

count_summaries = pd.concat(
    [pd.read_csv(f"results/func_scores/{s}_count_summary.csv") for s in selections],
    ignore_index=True,
)

func_scores = pd.concat(
    [
        pd.read_csv(f"results/func_scores/{s}_func_scores.csv").assign(selection=s)
        for s in selections
    ],
    ignore_index=True,
)

Distributions of functional scores¶

Plot the functional scores distribution among retained variants (adequate pre-selection counts).

These are plotted as ridgeplots.

In [5]:
# assign a library label
selections = func_scores['selection'].tolist()
libraries = [selection[0] for selection in selections]
func_scores['library'] = libraries

# classify variants
func_scores = func_scores.pipe(
    dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
)

def ridgeplot(df, library):
    # print out func_scores and figure out if it has library

    df = df.query('library==@library').copy()
    
    variant_classes = list(
        reversed(
            [
                c
                for c in [
                    "wildtype",
                    "synonymous",
                    "1 nonsynonymous",
                    ">1 nonsynonymous",
                    "deletion",
                    "stop",
                ]
                if c in set(df["variant_class"])
            ]
        )
    )
    
    assert set(df["variant_class"]) == set(variant_classes)
    
    
    
    # get smoothed distribution of functional scores
    bins = numpy.linspace(
        df["func_score"].min(),
        df["func_score"].max(),
        num=50,
    )
    smoothed_dist = pd.concat(
        [
            pd.DataFrame(
                {
                    "variant_class": var,
                    "func_score": bins,
                    "count": scipy.stats.gaussian_kde(df.groupby('barcode').agg({'func_score': 'mean'})["func_score"])(bins),
                    "mean_func_score": df.groupby('barcode').agg({'func_score': 'mean'})["func_score"].mean(),
                    "number of variants": len(df.groupby('barcode').agg({'func_score': 'mean'})["func_score"]),
                }
            )
            for (lib, var), df in df.groupby(["library", "variant_class"])
        ]
    )
    
    # assign y / y2 for plotting
    facet_overlap = 0.7  # maximal facet overlap
    max_count = (smoothed_dist["count"]).max()
    smoothed_dist = smoothed_dist.assign(
        y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
        y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
    )
    
    # ridgeline plot, based on this but using y / y2 rather than row:
    # https://altair-viz.github.io/gallery/ridgeline_plot.html
    ridgeline_chart = (
        alt.Chart(smoothed_dist)
        .encode(
            x=alt.X(
                "func_score", title="functional score", scale=alt.Scale(nice=False)
            ),
            y=alt.Y(
                "y",
                scale=alt.Scale(nice=False),
                title=None,
                axis=alt.Axis(
                    ticks=False,
                    domain=False,
                    # set manual labels https://stackoverflow.com/a/64106056
                    values=[v + 0.5 for v in range(len(variant_classes))],
                    labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
                ),
            ),
            y2=alt.Y2("y2"),
            fill=alt.Fill(
                "mean_func_score:Q",
                title="mean functional score",
                legend=alt.Legend(direction="horizontal"),
                scale=alt.Scale(scheme="yellowgreenblue"),
            ),
            tooltip=[
                "variant_class",
                alt.Tooltip(
                    "mean_func_score", format=".2f", title="mean functional score"
                ),
                "number of variants",
            ],
        )
        .mark_area(
            interpolate="monotone",
            smooth=True,
            fillOpacity=0.8,
            stroke="lightgray",
            strokeWidth=0.5,
        )
        .configure_view(stroke=None)
        .configure_axis(grid=False)
        .properties(width=180, height=22 * len(variant_classes))
    )

    
    ridgeline_chart = ridgeline_chart.properties(
        autosize=alt.AutoSizeParams(resize=True),
    )
    return(ridgeline_chart)
In [6]:
ridgeplot(func_scores, 'A')
Out[6]:
In [7]:
ridgeplot(func_scores, 'B')
Out[7]:
In [ ]: