Plot average functional scores per variant¶
Plot the functional scores from the functional selections.
First, import Python modules:
In [1]:
import altair as alt
import dms_variants.codonvarianttable
import dms_variants.utils
import numpy
import scipy.stats
import pandas as pd
import yaml
In [2]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
Read configuration:
In [3]:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.
with open("config.yaml") as f:
config = yaml.safe_load(f)
Read in all the data:
In [4]:
with open(config["func_effects_config"]) as f:
selections = list(yaml.safe_load(f)["avg_func_effects"]["TZM-bl_entry"]["selections"])
count_summaries = pd.concat(
[pd.read_csv(f"results/func_scores/{s}_count_summary.csv") for s in selections],
ignore_index=True,
)
func_scores = pd.concat(
[
pd.read_csv(f"results/func_scores/{s}_func_scores.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
)
Distributions of functional scores¶
Plot the functional scores distribution among retained variants (adequate pre-selection counts).
These are plotted as ridgeplots.
In [5]:
# assign a library label
selections = func_scores['selection'].tolist()
libraries = [selection[0] for selection in selections]
func_scores['library'] = libraries
# classify variants
func_scores = func_scores.pipe(
dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
)
def ridgeplot(df, library):
# print out func_scores and figure out if it has library
df = df.query('library==@library').copy()
variant_classes = list(
reversed(
[
c
for c in [
"wildtype",
"synonymous",
"1 nonsynonymous",
">1 nonsynonymous",
"deletion",
"stop",
]
if c in set(df["variant_class"])
]
)
)
assert set(df["variant_class"]) == set(variant_classes)
# get smoothed distribution of functional scores
bins = numpy.linspace(
df["func_score"].min(),
df["func_score"].max(),
num=50,
)
smoothed_dist = pd.concat(
[
pd.DataFrame(
{
"variant_class": var,
"func_score": bins,
"count": scipy.stats.gaussian_kde(df.groupby('barcode').agg({'func_score': 'mean'})["func_score"])(bins),
"mean_func_score": df.groupby('barcode').agg({'func_score': 'mean'})["func_score"].mean(),
"number of variants": len(df.groupby('barcode').agg({'func_score': 'mean'})["func_score"]),
}
)
for (lib, var), df in df.groupby(["library", "variant_class"])
]
)
# assign y / y2 for plotting
facet_overlap = 0.7 # maximal facet overlap
max_count = (smoothed_dist["count"]).max()
smoothed_dist = smoothed_dist.assign(
y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
)
# ridgeline plot, based on this but using y / y2 rather than row:
# https://altair-viz.github.io/gallery/ridgeline_plot.html
ridgeline_chart = (
alt.Chart(smoothed_dist)
.encode(
x=alt.X(
"func_score", title="functional score", scale=alt.Scale(nice=False)
),
y=alt.Y(
"y",
scale=alt.Scale(nice=False),
title=None,
axis=alt.Axis(
ticks=False,
domain=False,
# set manual labels https://stackoverflow.com/a/64106056
values=[v + 0.5 for v in range(len(variant_classes))],
labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
),
),
y2=alt.Y2("y2"),
fill=alt.Fill(
"mean_func_score:Q",
title="mean functional score",
legend=alt.Legend(direction="horizontal"),
scale=alt.Scale(scheme="yellowgreenblue"),
),
tooltip=[
"variant_class",
alt.Tooltip(
"mean_func_score", format=".2f", title="mean functional score"
),
"number of variants",
],
)
.mark_area(
interpolate="monotone",
smooth=True,
fillOpacity=0.8,
stroke="lightgray",
strokeWidth=0.5,
)
.configure_view(stroke=None)
.configure_axis(grid=False)
.properties(width=180, height=22 * len(variant_classes))
)
ridgeline_chart = ridgeline_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
return(ridgeline_chart)
In [6]:
ridgeplot(func_scores, 'A')
Out[6]:
In [7]:
ridgeplot(func_scores, 'B')
Out[7]:
In [ ]: