Analyze the variant counts¶
Analyze the counts of each variant in each sample as determined from the barcode sequencing.
First, import Python modules:
import altair as alt
import Bio.SeqIO
import dms_variants.codonvarianttable
import numpy
import pandas as pd
import yaml
# allow Altair to plot data frames larger than its default max-rows limit
# (the count data frames here can be large)
_ = alt.data_transformers.disable_max_rows()
Read configuration:
# NOTE: when running this notebook interactively rather than in the pipeline
# (which manages working directories), first `os.chdir` to the right directory.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)
Read in all the data:
# Read the barcode runs and get the samples, ordered by date then sample name.
barcode_runs = pd.read_csv(config["barcode_runs"], parse_dates=["date"])
barcode_runs = barcode_runs[["sample", "library", "date"]].sort_values(
    ["date", "sample"]
)
# render dates as YYYY-MM-DD strings for display in the plots below
barcode_runs["date"] = barcode_runs["date"].dt.strftime("%Y-%m-%d")
samples = list(barcode_runs["sample"])
# sample names must be unique across libraries / dates
assert len(samples) == len(set(samples))
# Read the per-sample barcode counts, invalid counts, and fates; concatenate
# across samples into one data frame per count type, tagging rows by sample.
count_dfs = {
    count_type: pd.concat(
        pd.read_csv(f"results/barcode_counts/{sample}_{count_type}.csv").assign(
            sample=sample
        )
        for sample in samples
    )
    for count_type in ["counts", "invalid", "fates"]
}
# read the site numbering map, which relates sequential site numbers to
# reference site numbers (used for the mutation-frequency plot below)
site_numbering_map = pd.read_csv(config["site_numbering_map"])
# Create the codon-variant table relating barcodes to the codon substitutions
# they carry, then attach the observed per-sample barcode counts to it.
variants = dms_variants.codonvarianttable.CodonVariantTable(
    barcode_variant_file=config["codon_variants"],
    geneseq=str(Bio.SeqIO.read(config["gene_sequence_codon"], "fasta").seq),
    allowgaps=True,
    substitutions_are_codon=True,
    primary_target="gene",
    substitutions_col="codon_substitutions",
)
# the counts need a "library" column, looked up from the barcode runs;
# `validate` guarantees each sample maps to exactly one library
variants.add_sample_counts_df(
    count_dfs["counts"].merge(
        barcode_runs[["sample", "library"]], on="sample", validate="many_to_one"
    )
)
Barcode sequencing stats¶
Plot "fates" of barcode reads: how many properly aligned to valid barcodes.
# Build one dropdown selection per filtering column ("library" and "date");
# each dropdown offers "all" plus every observed value of that column.
selections = []
for col in ["library", "date"]:
    col_values = barcode_runs[col].dropna().unique()
    dropdown = alt.binding_select(
        options=[None] + list(col_values),
        labels=["all"] + [str(val) for val in col_values],
        name=col,
    )
    selections.append(alt.selection_point(fields=[col], bind=dropdown))
# Stacked bar chart of barcode-read fates per sample; date / library are
# looked up from `barcode_runs` so the dropdown selections can filter on them.
fate_chart = (
    alt.Chart(count_dfs["fates"])
    .transform_lookup(
        "sample",
        alt.LookupData(barcode_runs, "sample", fields=["date", "library"]),
    )
    .encode(
        x=alt.X(
            "count", title="barcode sequencing counts", axis=alt.Axis(format=".2g")
        ),
        y=alt.Y("sample", title=None, sort=samples),
        color=alt.Color(
            "fate",
            scale=alt.Scale(reverse=True),
        ),
        # stack the fate segments in a consistent order across samples
        order=alt.Order("fate", sort="descending"),
        tooltip=[
            # numeric format only for the "count" column
            alt.Tooltip(c, format=".3g") if c == "count" else c
            for c in count_dfs["fates"].columns.tolist() + ["date:N", "library:N"]
        ],
    )
    .mark_bar()
    .properties(width=350, height=alt.Step(13))
    .configure_axis(labelLimit=500)
)
# attach the library / date dropdowns and filter the chart by them
for selection in selections:
    fate_chart = fate_chart.add_params(selection).transform_filter(selection)
display(fate_chart)
Average valid and invalid counts per barcode¶
Plot average valid and invalid counts per barcode. Invalid counts mean parseable barcodes that nonetheless don't map to our library's set of valid barcodes:
# Stack the valid and invalid barcode counts into one data frame, labeling
# each row, then average the counts per barcode for each sample / validity.
counts = pd.concat(
    [
        count_dfs["counts"].assign(valid="valid"),
        count_dfs["invalid"].assign(valid="invalid"),
    ]
)
avg_counts = (
    counts.groupby(["sample", "valid"], as_index=False)["count"]
    .mean()
    .rename(columns={"count": "avg_count"})
)
# legend click toggles showing valid vs invalid barcodes
valid_selection = alt.selection_point(fields=["valid"], bind="legend")
# Grouped bar chart of average counts per barcode, valid vs invalid side by side.
avg_counts_chart = (
    alt.Chart(avg_counts)
    .transform_lookup(
        "sample",
        alt.LookupData(barcode_runs, "sample", fields=["date", "library"]),
    )
    .encode(
        x=alt.X("avg_count", title="average counts per barcode"),
        y=alt.Y("sample", title=None, sort=samples),
        # offset puts the valid / invalid bars side by side within a sample
        yOffset="valid",
        color=alt.Color(
            "valid",
            title="valid barcode",
            scale=alt.Scale(domain=avg_counts["valid"].unique()),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c == "avg_count" else c
            for c in avg_counts.columns.tolist() + ["library:N", "date:N"]
        ],
    )
    .mark_bar()
    .properties(width=200, height=alt.Step(15, **{"for": "position"}))
    .configure_axis(labelLimit=500)
    .add_params(*selections, valid_selection)
    .transform_filter(valid_selection)
)
# also filter by the library / date dropdown selections
for selection in selections:
    avg_counts_chart = avg_counts_chart.transform_filter(selection)
avg_counts_chart
Check for abundant invalid barcodes¶
Check each sample to see whether any invalid barcodes are among the top most-abundant barcodes. If so, that is a likely sign of contamination.
# Rank barcodes within each sample from most to least abundant (rank 1 is
# the most-counted barcode). `ascending=False` is essential: with the
# default ascending rank, rank 1 would be the *rarest* barcode and the
# "top N" query below would pick the least abundant barcodes instead of
# the most abundant ones. `method="first"` breaks count ties stably.
invalid_rank = counts.assign(
    rank=lambda x: x.groupby("sample")["count"].rank(
        ascending=False, method="first"
    )
)
invalid_n = 20  # look for invalid barcodes in the top this many
# For each sample, count how many invalid barcodes fall within the top
# `invalid_n` most abundant, and record the best-ranked invalid barcode.
invalid_top_n_counts = (
    invalid_rank.sort_values("rank")
    .groupby(["sample", "valid"], as_index=False)
    .aggregate(
        invalid_in_top_n=pd.NamedAgg("rank", lambda s: len(s[s <= invalid_n])),
        top_invalid_rank=pd.NamedAgg("rank", "first"),
        top_invalid_barcode=pd.NamedAgg("barcode", "first"),
    )
    .query("valid == 'invalid'")
)
# Bar chart: number of invalid barcodes in each sample's top `invalid_n`.
invalid_top_n_counts_chart = (
    alt.Chart(invalid_top_n_counts)
    .transform_lookup(
        "sample",
        alt.LookupData(barcode_runs, "sample", fields=["date", "library"]),
    )
    .encode(
        x=alt.X(
            "invalid_in_top_n",
            title=f"invalid barcodes in top {invalid_n}",
            # fix the axis to the full possible range 0..invalid_n
            scale=alt.Scale(domain=(0, invalid_n)),
        ),
        y=alt.Y("sample", title=None, sort=samples),
        tooltip=invalid_top_n_counts.columns.tolist(),
    )
    .mark_bar()
    .properties(width=200, height=alt.Step(15))
    .configure_axis(labelLimit=500)
    .add_params(*selections)
)
# filter the chart by the library / date dropdown selections
for selection in selections:
    invalid_top_n_counts_chart = invalid_top_n_counts_chart.transform_filter(selection)
display(invalid_top_n_counts_chart)
print("\nHere are top invalid barcodes:")
# list the individual invalid barcodes ranked within the top `invalid_n`
invalid_rank.query("valid == 'invalid'").query("rank <= @invalid_n")
Here are top invalid barcodes:
barcode | count | sample | valid | rank |
---|
Counts of each target type¶
We may have different target types in addition to the gene, such as a spike-in or neutralization standard. Plot the average counts per variant of each target type:
# average counts per variant for each target type, merged with the run
# metadata (merge is on the shared columns, presumably sample / library —
# NOTE(review): no explicit `on`, so verify the shared columns are intended)
avg_counts_by_target = variants.avgCountsPerVariant(libraries=variants.libraries).merge(
    barcode_runs, validate="many_to_one"
)
# legend selection to toggle target types; initially show just the gene
target_selection = alt.selection_point(fields=["target"], bind="legend", value="gene")
# Grouped bar chart of average counts per variant, by target type.
avg_counts_by_target_chart = (
    alt.Chart(avg_counts_by_target)
    .encode(
        x=alt.X("avg_counts_per_variant", title="average counts per variant"),
        y=alt.Y("sample", title=None, sort=samples),
        # offset puts the per-target bars side by side within a sample
        yOffset="target",
        color=alt.Color(
            "target",
            title="variant type",
            scale=alt.Scale(domain=avg_counts_by_target["target"].unique()),
        ),
        tooltip=[
            # compare against "avg_counts_per_variant" (plural "counts"),
            # the column plotted on the x-axis, so the numeric format is
            # actually applied; the old singular spelling never matched
            alt.Tooltip(c, format=".3g") if c == "avg_counts_per_variant" else c
            for c in avg_counts_by_target.columns
        ],
    )
    .mark_bar()
    .properties(width=200, height=alt.Step(15, **{"for": "position"}))
    .configure_axis(labelLimit=500)
    .add_params(*selections, target_selection)
    .transform_filter(target_selection)
)
# also filter by the library / date dropdown selections
for selection in selections:
    avg_counts_by_target_chart = avg_counts_by_target_chart.transform_filter(selection)
avg_counts_by_target_chart
Plot fraction of all counts that are to each type of target:
# Total counts per target within each sample / library combination.
total_by_target = variants.variant_count_df.groupby(
    ["sample", "library", "target"]
).aggregate(total_counts=pd.NamedAgg("count", "sum"))
# Denominator: the total counts across all targets in each sample / library.
sample_totals = total_by_target.groupby(["sample", "library"])[
    "total_counts"
].transform("sum")
# Fraction of each sample's counts going to each target, plus run metadata.
target_fracs = (
    total_by_target.assign(frac_counts=total_by_target["total_counts"] / sample_totals)
    .drop(columns="total_counts")
    .reset_index()
    .merge(barcode_runs, validate="many_to_one")
)
# legend selection to toggle which targets are shown
target_selection = alt.selection_point(fields=["target"], bind="legend")
# Stacked bar chart of the fraction of counts going to each target type.
target_fracs_chart = (
    alt.Chart(target_fracs)
    .encode(
        x=alt.X("frac_counts", title="fraction of variant counts"),
        y=alt.Y("sample", title=None, sort=samples),
        color=alt.Color(
            "target",
            title="variant type",
            scale=alt.Scale(domain=target_fracs["target"].unique()),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c == "frac_counts" else c
            for c in target_fracs.columns
        ],
    )
    .mark_bar()
    .properties(width=200, height=alt.Step(12))
    .configure_axis(labelLimit=500)
    # legend click toggles targets; dropdowns filter by library / date
    .add_params(*selections, target_selection)
    .transform_filter(target_selection)
)
for selection in selections:
    target_fracs_chart = target_fracs_chart.transform_filter(selection)
target_fracs_chart
Diversity of barcodes for each sample¶
Get the diversity of barcodes mapping to the gene of interest (not neutralization standards or other non-primary targets) for each sample.
We compute three diversity indices:
- 0D: number of unique barcodes observed in that sample
- 1D: effective number of barcodes (Shannon diversity)
- 2D: inverse Simpson index (inverse probability two random barcodes are the same)
All are plotted below. The plot is interactive: you can select by date or library, and click the legend to show just one particular diversity measure.
# Map from data-frame column name to human-readable diversity-measure label.
diversity_measures = {
    "unique_barcodes_observed": "0D (unique barcodes observed)",
    "effective_n_barcodes": "1D (effective barcodes)",
    "inverse_simpson_index": "2D (inverse Simpson index)",
}


def _effective_n_barcodes(barcode_counts):
    """Exponential of the Shannon entropy of the barcode-count frequencies."""
    freqs = barcode_counts / barcode_counts.sum()
    return numpy.exp(-(freqs * numpy.log(freqs)).sum())


def _inverse_simpson_index(barcode_counts):
    """Inverse of the probability two randomly drawn barcodes are the same."""
    freqs = barcode_counts / barcode_counts.sum()
    return 1 / (freqs**2).sum()


# Compute all three diversity indices per sample for primary-target barcodes
# with nonzero counts, then reshape to long form for plotting.
diversity = (
    variants.variant_count_df.query("target == @variants.primary_target")
    .query("count > 0")[["sample", "barcode", "count"]]
    .groupby("sample", as_index=False)
    .aggregate(
        unique_barcodes_observed=pd.NamedAgg("barcode", "count"),
        effective_n_barcodes=pd.NamedAgg("count", _effective_n_barcodes),
        inverse_simpson_index=pd.NamedAgg("count", _inverse_simpson_index),
    )
    .melt(id_vars="sample", var_name="diversity_measure", value_name="diversity")
    .merge(barcode_runs, on="sample", validate="many_to_one")
    .assign(diversity_measure=lambda x: x["diversity_measure"].map(diversity_measures))
)
# legend click toggles which diversity measure is shown
diversity_selection = alt.selection_point(fields=["diversity_measure"], bind="legend")
# Grouped bar chart of the three diversity measures per sample.
diversity_chart = (
    alt.Chart(diversity)
    .encode(
        x=alt.X("diversity"),
        y=alt.Y("sample", title=None, sort=samples),
        # offset puts the three diversity bars side by side within a sample
        yOffset="diversity_measure",
        color=alt.Color(
            "diversity_measure",
            scale=alt.Scale(domain=list(diversity_measures.values())),
        ),
        tooltip=[
            # one decimal place for float columns, plain display otherwise
            alt.Tooltip(c, format=".1f") if diversity[c].dtype == float else c
            for c in diversity.columns
        ],
    )
    .mark_bar()
    .properties(width=200, height=alt.Step(15, **{"for": "position"}))
    .configure_axis(labelLimit=500)
)
# attach the dropdowns and legend selection, filtering by each
for selection in selections + [diversity_selection]:
    diversity_chart = diversity_chart.add_params(selection).transform_filter(selection)
diversity_chart
Average mutations per variant¶
Plot average number of mutations per variant of each type of codon mutation:
# Average number of codon mutations per variant, broken down by mutation type.
avg_muts = (
    variants.numCodonMutsByType(
        variant_type="all",
        libraries=variants.libraries,
    )
    .drop(columns=["count", "num_muts_count"])
    .merge(barcode_runs, validate="many_to_one")
    # convert the categorical mutation-type column back to plain strings
    # to enable plotting below
    .assign(mutation_type=lambda x: x["mutation_type"].tolist())
    .rename(columns={"number": "average mutations"})
)
# legend click toggles which mutation types are shown
mut_type_selection = alt.selection_point(fields=["mutation_type"], bind="legend")
mut_types = avg_muts["mutation_type"].unique()
# Grouped bar chart of average mutations per variant by mutation type.
avg_muts_chart = (
    alt.Chart(avg_muts)
    .encode(
        x=alt.X("average mutations", axis=alt.Axis(grid=False)),
        # sort samples by date then name (`sort=samples`) so this chart is
        # ordered consistently with every other per-sample chart in this
        # notebook; previously the sort was omitted here only
        y=alt.Y("sample", title=None, sort=samples),
        # offset puts per-mutation-type bars side by side within a sample
        yOffset=alt.YOffset("mutation_type", sort=mut_types),
        color=alt.Color("mutation_type", scale=alt.Scale(domain=mut_types)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c == "average mutations" else c
            for c in avg_muts.columns
        ],
    )
    .mark_bar()
    .properties(height=alt.Step(20, **{"for": "position"}), width=300)
    .configure_axis(labelLimit=500)
    .add_params(mut_type_selection, *selections)
    .transform_filter(mut_type_selection)
)
# also filter by the library / date dropdown selections
for selection in selections:
    avg_muts_chart = avg_muts_chart.transform_filter(selection)
avg_muts_chart
Mutation frequencies across gene¶
Plot percent of all variants with any type of amino-acid changing mutation type (nonsynonymous, stop, or deletion) at each site:
# Get site mutation frequencies in wide form to make the data as compact as
# possible (this is a lot of data, and storing it compactly shrinks the plot).
# We also encode the samples as integers because Altair cannot
# `transform_fold` column names that are strings containing a dot.
sample_coding = {sample: str(i) for (i, sample) in enumerate(samples)}
site_freqs_wide = (
    variants.mutCounts(variant_type="all", mut_type="aa", libraries=variants.libraries)
    .groupby(["sample", "site"])
    .aggregate(mutation_count=pd.NamedAgg("count", "sum"))
    .reset_index()
    # add the per-sample variant totals to serve as the denominator
    .merge(
        variants.n_variants_df(libraries=variants.libraries, primary_target_only=True),
        validate="many_to_one",
    )
    .assign(percent_w_mutation=lambda x: 100 * x["mutation_count"] / x["count"])
    # pivot to wide form: one row per site, one column per sample
    .pivot_table(
        index="site",
        values="percent_w_mutation",
        columns="sample",
    )
    .reset_index()
    .rename(columns={"site": "sequential_site"})
    # rename the sample columns to their integer codes
    .rename(columns=sample_coding)
    # add the wildtype amino acid and the reference site numbering
    .assign(wildtype=lambda x: x["sequential_site"].map(variants.aas))
    .merge(
        site_numbering_map[["sequential_site", "reference_site"]],
        validate="one_to_one",
    )
)
# every site must have a frequency for every sample
assert not site_freqs_wide.isnull().any().any()
Make the plot. Note that you can select which libraries or dates to show with the dropdown box at bottom:
# Line plot of the percent of variants with an amino-acid-changing mutation
# at each site, faceted by sample.
site_freqs_chart = (
    alt.Chart(site_freqs_wide)
    # fold the integer-coded per-sample columns back into long form
    .transform_fold(list(sample_coding.values()), ["sample_coding", "percent"])
    # recover library / date / sample name from the integer sample code
    .transform_lookup(
        lookup="sample_coding",
        from_=alt.LookupData(
            data=barcode_runs.assign(
                sample_coding=lambda x: x["sample"].map(sample_coding)
            ),
            key="sample_coding",
            fields=["library", "date", "sample"],
        ),
    )
    .encode(
        x=alt.X(
            "reference_site",
            scale=alt.Scale(nice=False, zero=False),
            # order sites sequentially even when reference numbering differs
            sort=alt.SortField("sequential_site"),
            axis=alt.Axis(labelOverlap=True),
        ),
        y=alt.Y("percent:Q", title="% variants w mutation"),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c in {"percent:Q"} else c
            for c in [
                "sequential_site",
                "reference_site",
                "wildtype",
                "percent:Q",
                "date:N",
            ]
        ],
        # one small panel per sample, three across
        facet=alt.Facet(
            "sample:N",
            title=None,
            header=alt.Header(labelPadding=0),
            columns=3,
            spacing=5,
        ),
    )
    .mark_line(point=True, size=0.5)
    .properties(height=100, width=275, autosize=alt.AutoSizeParams(resize=True))
    .add_params(*selections)
    # each facet gets its own y-axis scale
    .resolve_scale(y="independent")
    .configure_axis(grid=False)
)
# filter the chart by the library / date dropdown selections
for selection in selections:
    site_freqs_chart = site_freqs_chart.transform_filter(selection)
site_freqs_chart