Analyze the functional scoresΒΆ
Analyze the functional scores from the functional selections.
First, import Python modules:
import altair as alt
import dms_variants.codonvarianttable
import dms_variants.utils
import numpy
import scipy.stats
import pandas as pd
import yaml
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
Read configuration:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.
with open("config.yaml") as f:
config = yaml.safe_load(f)
Read in all the data:
with open(config["func_effects_config"]) as f:
selections = list(yaml.safe_load(f)["func_scores"])
count_summaries = pd.concat(
[pd.read_csv(f"results/func_scores/{s}_count_summary.csv") for s in selections],
ignore_index=True,
)
func_scores = pd.concat(
[
pd.read_csv(f"results/func_scores/{s}_func_scores.csv").assign(selection=s)
for s in selections
],
ignore_index=True,
)
Pre- and post-selection counts for variantsΒΆ
Plot the pre-selection and post-selection sample count distributions for the variants.
The boxes span the first to third quartiles, the black line is the median, and the thin horizontal line spans the full range.
For the pre-selection counts the red line is the minimum counts: anything below that (with fewer pre-selection counts) is not included in the set of functional scores that are calculated.
Note that the x-axis uses a symlog scale.
You can use the options at the bottom to subset to specific samples.
charts = []
for count_type in ["pre", "post"]:
base_chart = (
alt.Chart(count_summaries)
.encode(
y=alt.Y("selection", title=None, axis=alt.Axis(labels=count_type == "pre")),
tooltip=count_summaries.columns.tolist(),
)
.properties(
width=275,
height=alt.Step(15),
title=f"{count_type}-selection counts",
)
)
quantile_bar = base_chart.encode(
x=alt.X(
f"{count_type}_count_q1",
scale=alt.Scale(type="symlog", constant=20),
axis=alt.Axis(labelOverlap=True),
title=f"{count_type}-selection counts for variant",
),
x2=f"{count_type}_count_q3",
).mark_bar(color="blue", height={"band": 0.8})
range_line = base_chart.encode(
x=f"{count_type}_count_min",
x2=f"{count_type}_count_max",
).mark_rule(color="blue", opacity=0.5)
median_line = base_chart.encode(
x=f"{count_type}_count_median",
x2=f"{count_type}_count_median",
).mark_bar(xOffset=1, x2Offset=-1, color="black", height={"band": 0.8})
if count_type == "pre":
threshold_line = base_chart.encode(
x="min_pre_selection_count",
x2="min_pre_selection_count",
).mark_bar(xOffset=1, x2Offset=-1, color="red", height={"band": 0.8})
charts.append(quantile_bar + range_line + median_line + threshold_line)
else:
charts.append(quantile_bar + range_line + median_line)
count_summary_chart = alt.hconcat(*charts)
selectors = [
alt.selection_point(
fields=[sel],
bind=alt.binding_select(
options=[None] + sorted(count_summaries[sel].unique()),
labels=["all"] + sorted(count_summaries[sel].unique()),
name=sel,
),
)
for sel in ["library", "pre_selection_date", "post_selection_date"]
]
for sel in selectors:
count_summary_chart = count_summary_chart.add_params(sel).transform_filter(sel)
count_summary_chart = count_summary_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
count_summary_chart
Distributions of functional scoresΒΆ
Plot the functional scores distribution among retained variants (adequate pre-selection counts).
These are plotted as ridgeplots.
# classify variants
func_scores = func_scores.pipe(
dms_variants.codonvarianttable.CodonVariantTable.classifyVariants
)
def ridgeplot(df):
variant_classes = list(
reversed(
[
c
for c in [
"wildtype",
"synonymous",
"1 nonsynonymous",
">1 nonsynonymous",
"deletion",
"stop",
]
if c in set(df["variant_class"])
]
)
)
assert set(df["variant_class"]) == set(variant_classes)
# get smoothed distribution of functional scores
bins = numpy.linspace(
df["func_score"].min(),
df["func_score"].max(),
num=50,
)
smoothed_dist = pd.concat(
[
pd.DataFrame(
{
"selection": sel,
"variant_class": var,
"func_score": bins,
"count": scipy.stats.gaussian_kde(df["func_score"])(bins),
"mean_func_score": df["func_score"].mean(),
"number of variants": len(df),
}
)
for (sel, var), df in df.groupby(["selection", "variant_class"])
]
).merge(
count_summaries[
["selection", "library", "pre_selection_date", "post_selection_date"]
],
on="selection",
validate="many_to_one",
)
# assign y / y2 for plotting
facet_overlap = 0.7 # maximal facet overlap
max_count = (smoothed_dist["count"]).max()
smoothed_dist = smoothed_dist.assign(
y=lambda x: x["variant_class"].map(lambda v: variant_classes.index(v)),
y2=lambda x: x["y"] + x["count"] / max_count / facet_overlap,
)
# ridgeline plot, based on this but using y / y2 rather than row:
# https://altair-viz.github.io/gallery/ridgeline_plot.html
ridgeline_chart = (
alt.Chart(smoothed_dist)
.encode(
x=alt.X(
"func_score", title="functional score", scale=alt.Scale(nice=False)
),
y=alt.Y(
"y",
scale=alt.Scale(nice=False),
title=None,
axis=alt.Axis(
ticks=False,
domain=False,
# set manual labels https://stackoverflow.com/a/64106056
values=[v + 0.5 for v in range(len(variant_classes))],
labelExpr=f"{str(variant_classes)}[round(datum.value - 0.5)]",
),
),
y2=alt.Y2("y2"),
fill=alt.Fill(
"mean_func_score:Q",
title="mean functional score",
legend=alt.Legend(direction="horizontal"),
scale=alt.Scale(scheme="yellowgreenblue"),
),
facet=alt.Facet(
"selection",
columns=4,
title=None,
header=alt.Header(
labelFontWeight="bold",
labelPadding=0,
),
),
tooltip=[
"selection",
"variant_class",
alt.Tooltip(
"mean_func_score", format=".2f", title="mean functional score"
),
"number of variants",
"library",
"pre_selection_date",
"post_selection_date",
],
)
.mark_area(
interpolate="monotone",
smooth=True,
fillOpacity=0.8,
stroke="lightgray",
strokeWidth=0.5,
)
.configure_view(stroke=None)
.configure_axis(grid=False)
.properties(width=180, height=22 * len(variant_classes))
)
for sel in selectors:
ridgeline_chart = ridgeline_chart.add_params(sel).transform_filter(sel)
ridgeline_chart = ridgeline_chart.properties(
autosize=alt.AutoSizeParams(resize=True),
)
return ridgeline_chart
ridgeplot(func_scores)
Correlations in variant functional scoresΒΆ
Correlations among functional scores of variants with the same barcode in the same library:
pre_selection_dates = count_summaries.set_index("selection")[
"pre_selection_date"
].to_dict()
post_selection_dates = count_summaries.set_index("selection")[
"post_selection_date"
].to_dict()
corrs = (
dms_variants.utils.tidy_to_corr(
df=func_scores.merge(
count_summaries[["selection", "library"]],
on="selection",
validate="many_to_one",
),
sample_col="selection",
label_col="barcode",
value_col="func_score",
group_cols="library",
)
.rename(columns={"correlation": "r"})
.assign(
r2=lambda x: x["r"] ** 2,
pre_date_1=lambda x: x["selection_1"].map(pre_selection_dates),
pre_date_2=lambda x: x["selection_2"].map(pre_selection_dates),
pre_selection_date=lambda x: x["pre_date_1"].where(
x["pre_date_1"] == x["pre_date_2"], pd.NA
),
post_date_1=lambda x: x["selection_1"].map(post_selection_dates),
post_date_2=lambda x: x["selection_2"].map(post_selection_dates),
post_selection_date=lambda x: x["post_date_1"].where(
x["post_date_1"] == x["post_date_2"], pd.NA
),
)
.drop(columns=["pre_date_1", "pre_date_2", "post_date_1", "post_date_2"])
)
for library, library_corr in corrs.groupby("library"):
corr_chart = (
alt.Chart(library_corr)
.encode(
alt.X("selection_1", title=None),
alt.Y("selection_2", title=None),
color=alt.Color("r2", scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip(c, format=".3g") if c in ["r", "r2"] else c
for c in corrs.columns
],
)
.mark_rect(stroke="black")
.properties(width=alt.Step(15), height=alt.Step(15), title=library)
.configure_axis(labelLimit=500)
)
for sel in selectors:
corr_chart = corr_chart.add_params(sel).transform_filter(sel)
corr_chart = corr_chart.properties(autosize=alt.AutoSizeParams(resize=True))
display(corr_chart)