Build PacBio consensus sequences¶
This notebook builds consensus sequences for each barcode from the PacBio CCS sequencing.
Import Python modules:
import itertools
import os
import Bio.SeqIO
import alignparse.consensus
import alignparse.utils
import altair as alt
import dms_variants.barcodes
import numpy
import pandas as pd
import scipy.stats
import yaml
_ = alt.data_transformers.disable_max_rows()
Get configuration information:
# If you are running the notebook interactively rather than in a pipeline that
# handles working directories, you may first have to `os.chdir` to the appropriate directory.
with open("config.yaml") as f:
config = yaml.safe_load(f)
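For reference, here is a sketch of the config keys this notebook uses (the file path and error-rate value are hypothetical; the variant tags and consensus parameters match those shown later in this notebook):
# illustrative sketch only; the real values come from config.yaml
config_example = {
    "gene_sequence_codon": "data/gene_sequence.fasta",  # hypothetical path
    "variant_tags": {
        "variant_tag5": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
        "variant_tag3": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
    },
    "max_ccs_error_rate": 1e-4,  # hypothetical value
    "consensus_params": {
        "max_sub_diffs": None,
        "max_indel_diffs": None,
        "max_minor_sub_frac": 0.2,
        "max_minor_indel_frac": 0.2,
        "min_support": 2,
    },
}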
Get the aligned CCSs:
aligned_ccs = pd.read_csv(
"results/process_ccs/CCSs_aligned_to_amplicon.csv",
na_filter=None,
)
Get the gene sequence:
geneseq = str(Bio.SeqIO.read(config["gene_sequence_codon"], "fasta").seq)
print(f"Read gene of length {len(geneseq)} nucleotides")
Read gene of length 3747 nucleotides
Identify strand exchange and filter CCSs known to have it¶
We may have variant tags at each end of the read that allow us to quantify strand exchange and filter out some of it (exchange between variants with different tags):
variant_tags = config["variant_tags"]
if not variant_tags:
print("No variant tags specified, so cannot look for strand exchange")
else:
display(pd.DataFrame(variant_tags))
# invert dictionary so tag identities given by nucleotide identity
variant_tags_by_nt = {
tag: {val: key for key, val in vals.items()}
for tag, vals in variant_tags.items()
}
    assert all(
        len(d1) == len(d2)
        for d1, d2 in zip(variant_tags.values(), variant_tags_by_nt.values())
    )
# function to assign strand exchange status
def classify_strand_exchange(row):
tags = list({row[tag] for tag in variant_tags})
if len(tags) == 1:
return tags[0]
elif "wildtype" in tags:
return "partially wildtype"
elif "invalid" in tags:
return "invalid nucleotide"
else:
return "strand exchange"
# assign strand exchange status
for tag, d in variant_tags_by_nt.items():
aligned_ccs[tag] = aligned_ccs[f"{tag}_sequence"].map(d).fillna("invalid")
aligned_ccs["strand_exchange"] = aligned_ccs.apply(classify_strand_exchange, axis=1)
# get summary stats
strand_exchange_stats = (
aligned_ccs.groupby(["pacbioRun", "strand_exchange"], as_index=False)
.aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
.assign(
fraction=lambda x: x["n_CCSs"]
/ x.groupby("pacbioRun")["n_CCSs"].transform("sum")
)
)
# plot summary stats
strand_exchange_chart = (
alt.Chart(strand_exchange_stats)
.encode(
x="n_CCSs:Q",
y=alt.Y(
"strand_exchange:N",
axis=alt.Axis(title=None),
),
facet=alt.Facet("pacbioRun", columns=2, title=None),
tooltip=strand_exchange_stats.columns.tolist(),
)
.mark_bar()
.properties(width=250, height=100)
.resolve_scale(x="independent")
)
display(strand_exchange_chart)
# Filter out CCSs with strand exchange. Note that this approach is only
# expected to catch ~half of CCSs with strand exchange.
print(f"There are {len(aligned_ccs)} before filtering.")
aligned_ccs = aligned_ccs.query('strand_exchange != "strand exchange"')
print(f"There are {len(aligned_ccs)} after filtering.")
| | variant_tag5 | variant_tag3 |
|---|---|---|
| variant_1 | G | G |
| variant_2 | C | C |
| wildtype | A | A |
There are 2192795 CCSs before filtering.
There are 2174477 CCSs after filtering.
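To make the classification logic concrete, here is a small illustration on hand-constructed tag calls (hypothetical rows; this assumes variant tags were specified so classify_strand_exchange is defined):
# hypothetical tag calls illustrating each classification outcome
example_rows = [
    {"variant_tag5": "variant_1", "variant_tag3": "variant_1"},  # -> "variant_1"
    {"variant_tag5": "variant_1", "variant_tag3": "wildtype"},  # -> "partially wildtype"
    {"variant_tag5": "variant_1", "variant_tag3": "invalid"},  # -> "invalid nucleotide"
    {"variant_tag5": "variant_1", "variant_tag3": "variant_2"},  # -> "strand exchange"
]
for row in example_rows:
    print(row, "->", classify_strand_exchange(row))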
Filter CCSs for accuracy¶
Plot the gene and barcode accuracy for each CCS, and only keep those above an accuracy threshold:
max_error_rate = config["max_ccs_error_rate"] # require error rate <= this to keep CCS
log10_error_floor = -7 # error rates less than this set to this for plotting
log10_error_ceil = -2 # error rates greater than this set to this for plotting
nbins = 100 # bins for cumulative fraction plot
# calculate error rates
aligned_ccs = aligned_ccs.assign(
gene_error=lambda x: 1 - x["gene_accuracy"],
barcode_error=lambda x: 1 - x["barcode_accuracy"],
)
# calculate cumulative frequencies on log error rates
cumfrac = (
aligned_ccs.melt(
id_vars=["query_name", "pacbioRun"],
value_vars=["barcode_error", "gene_error"],
value_name="error",
)
    .assign(
        log10_error=lambda x: numpy.log10(x["error"]).clip(
            lower=log10_error_floor, upper=log10_error_ceil
        )
    )
.groupby(["variable", "pacbioRun"])
.apply(
lambda g: pd.DataFrame(
{
"cumulative_count": scipy.stats.cumfreq(
g["log10_error"],
numbins=nbins,
defaultreallimits=(log10_error_floor, log10_error_ceil),
)[0],
"log10_error": numpy.linspace(
log10_error_floor, log10_error_ceil, nbins
),
}
)
)
.assign(
meets_accuracy_cutoff=lambda x: x["log10_error"] <= numpy.log10(max_error_rate),
cumulative_fraction=lambda x: x["cumulative_count"] / len(aligned_ccs),
)
.reset_index()
)
# plot cumulative frequencies
cumfrac_chart = (
alt.Chart(cumfrac)
.encode(
x=alt.X(
"log10_error",
scale=alt.Scale(zero=False),
),
y="cumulative_count",
color="meets_accuracy_cutoff",
column=alt.Column("variable", title=None),
row=alt.Row("pacbioRun", title=None),
tooltip=[
alt.Tooltip("log10_error", format=".3f"),
alt.Tooltip("cumulative_fraction", format=".3f"),
alt.Tooltip("cumulative_count", format=".4g"),
],
)
.mark_point(filled=True, size=30)
.properties(width=225, height=120)
)
display(cumfrac_chart)
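For readers unfamiliar with scipy.stats.cumfreq used above: it bins values over fixed limits and returns the cumulative count per bin, which is what the chart plots. A minimal toy example:
# toy illustration of scipy.stats.cumfreq: four values binned over (0, 4)
toy_result = scipy.stats.cumfreq([1, 2, 2, 3], numbins=4, defaultreallimits=(0, 4))
print(toy_result.cumcount)  # [0. 1. 3. 4.]: cumulative counts for bins [0,1), [1,2), [2,3), [3,4]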
Remove CCSs that do not meet the accuracy threshold:
print(f"There are {len(aligned_ccs)} CCSs before accuracy filtering.")
aligned_ccs = aligned_ccs.query("barcode_error <= @max_error_rate").query(
"gene_error <= @max_error_rate"
)
print(f"After filtering {len(aligned_ccs)} CCSs remain.")
There are 2174477 CCSs before accuracy filtering.
After filtering 1047036 CCSs remain.
Convert in-frame deletions to substitutions¶
If a deletion is in-frame, convert it to substitution format using - as the substitution character:
deltosubs = alignparse.utils.InFrameDeletionsToSubs(geneseq)
aligned_ccs = (
aligned_ccs.assign(
mutations_inframe=lambda x: x["gene_mutations"].map(deltosubs.dels_to_subs),
inframe_deletion=lambda x: x["gene_mutations"] != x["mutations_inframe"],
)
.drop(columns="gene_mutations")
.rename(columns={"mutations_inframe": "gene_mutations"})
)
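As a sketch of what this conversion does, consider a toy 9-nucleotide gene (the expected outputs here reflect our understanding of the alignparse API; the exact format may differ slightly):
# toy three-codon gene to illustrate the conversion
toy_deltosubs = alignparse.utils.InFrameDeletionsToSubs("ATGGCTAAA")
print(toy_deltosubs.dels_to_subs("del4to6"))  # codon-aligned deletion -> expect "G4- C5- T6-"
print(toy_deltosubs.dels_to_subs("del4to5"))  # length not a multiple of 3, so expected to be left unchanged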
How many CCSs have in-frame deletions?
print("Number of CCSs with in-frame deletions:")
display(
aligned_ccs.groupby("inframe_deletion")
.aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
.assign(fraction=lambda x: x["n_CCSs"] / x["n_CCSs"].sum())
.round(3)
)
Number of CCSs with in-frame deletions:
| inframe_deletion | n_CCSs | fraction |
|---|---|---|
| False | 958971 | 0.916 |
| True | 88065 | 0.084 |
Empirical accuracy of CCSs¶
We can compute the empirical accuracy of individual CCSs by comparing mutations found in CCSs with the same barcode.
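The intuition behind the estimate: if each CCS independently reports the correct mutations with probability a, then two CCSs sharing a barcode agree with probability roughly a**2, so a is about the square root of the observed pairwise concordance. A simplified sketch with hypothetical numbers (alignparse.consensus.empirical_accuracy performs the real calculation, which accounts for details this ignores):
import math

# hypothetical: 81% of CCS pairs sharing a barcode have identical mutation strings
pairwise_concordance = 0.81
estimated_ccs_accuracy = math.sqrt(pairwise_concordance)
print(estimated_ccs_accuracy)  # 0.9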
First, annotate which CCSs have indels (not in frame):
aligned_ccs = alignparse.consensus.add_mut_info_cols(
aligned_ccs,
mutation_col="gene_mutations",
n_indel_col="n_indels",
overwrite_cols=True,
).assign(has_indel=lambda x: x["n_indels"] > 0)
Now compute the empirical accuracy, both for all sequences and for only those without indels, and both at the error-rate filter we chose to use and at one 10-fold more stringent. Note that in-frame codon deletions are not classified as deletions here, since we re-classified them as substitutions above. The empirical accuracy is the estimated accuracy of each individual CCS:
# compute empirical accuracies
empirical_acc = []
for no_indel, acc_10x in itertools.product([True, False], [True, False]):
df = aligned_ccs.copy()
    if no_indel:
        df = df.query("n_indels == 0")
        label = "no indels"
    else:
        label = "allow indels"
if acc_10x:
max_error_rate_10x = max_error_rate / 10
df = df.query("barcode_error <= @max_error_rate_10x").query(
"gene_error <= @max_error_rate_10x"
)
label += ", 10x accuracy"
for pacbioRun, run_df in df.groupby("pacbioRun"):
empirical_acc.append(
alignparse.consensus.empirical_accuracy(
run_df, upstream_group_cols=None, mutation_col="gene_mutations"
).assign(label=label, pacbioRun=pacbioRun)
)
empirical_acc = pd.concat(empirical_acc, ignore_index=True)
# plot empirical accuracy
print("Empirical accuracies:")
empirical_acc_chart = (
alt.Chart(empirical_acc)
.encode(
x=alt.X(
"accuracy:Q",
scale=alt.Scale(domain=(0, 1)),
),
y=alt.Y(
"label:N",
axis=alt.Axis(title=None),
),
facet=alt.Facet(
"pacbioRun",
title=None,
columns=2,
),
tooltip=[alt.Tooltip("accuracy", format=".3f")],
)
.mark_point(filled=True, size=75)
.properties(width=225, height=65)
)
display(empirical_acc_chart)
Empirical accuracies:
Rarefaction and other estimates of library diversity¶
To help understand how many barcodes there are and how evenly reads are distributed among them, we make rarefaction curves. We also compute a few other metrics of diversity:
- number of barcodes observed more than n times, for n up to the minimum number of CCSs required to call a consensus below.
- inverse Simpson index, which is the reciprocal of the probability that two randomly drawn sequences have the same barcode (see the sketch after this list).
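For intuition, the inverse Simpson index can be read as the effective number of equally common barcodes; a minimal sketch of the computation on toy counts:
# toy counts: one barcode has half the reads, two others share the rest
toy_counts = numpy.array([50, 25, 25])
p = toy_counts / toy_counts.sum()
print(1 / (p**2).sum())  # ~2.67; would be exactly 3 if all three barcodes were equally common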
# required CCSs per barcode to call consensus below
min_support = config["consensus_params"]["min_support"]
# number of counts for each library / barcode
barcodecounts = aligned_ccs.groupby(["library", "barcode"], as_index=False).aggregate(
count=pd.NamedAgg("query_name", "count")
)
# make rarefaction plot
rarefy_df = pd.concat(
[
(
dms_variants.barcodes.rarefyBarcodes(
df, maxpoints=1000, logspace=False
).assign(library=library)
)
for library, df in barcodecounts.groupby("library")
]
)
rarefy_chart = (
alt.Chart(rarefy_df)
.encode(
x=alt.X(
"ncounts",
title="number of CCSs",
),
        y=alt.Y(
"nbarcodes",
title="number of barcodes",
),
color=alt.Color("library"),
tooltip=rarefy_df.columns.tolist(),
)
.mark_point(size=10, filled=True)
.resolve_scale(y="independent")
.properties(height=175, width=300)
)
display(rarefy_chart)
# compute diversity statistics
diversity = dms_variants.barcodes.inverse_simpson_index(barcodecounts).melt(
id_vars="library",
value_vars="inverse_simpson_index",
var_name="metric",
value_name="diversity",
)
for n in range(min_support):
diversity = pd.concat(
[
diversity,
barcodecounts.query("count > @n")
.groupby("library", as_index=False)
.aggregate(diversity=pd.NamedAgg("barcode", "nunique"))
.assign(metric=f"barcodes with >{n} CCSs"),
],
ignore_index=True,
)
diversity_chart = (
alt.Chart(diversity)
.encode(
x=alt.X("diversity"),
y=alt.Y("metric", title=None),
facet=alt.Facet("library", columns=2, title=None),
tooltip=diversity.columns.tolist(),
)
.mark_bar()
.properties(width=200, height=15 * (min_support + 1))
)
display(diversity_chart)
Build consensus sequences¶
Use the alignparse.consensus.simple_mutconsensus method to build consensus sequences, and plot how many barcodes and CCSs contributed to valid consensuses or had to be dropped. Note that the stats for barcodes and CCSs look different because the number of CCSs per barcode is uneven:
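To see how simple_mutconsensus behaves, here is a toy example (hypothetical data; per our reading of the alignparse API, it groups CCSs by library and barcode, calls the mutations shared within each group, and drops barcodes whose CCSs disagree too much or have too little support):
# toy CCSs: barcode AAC has three concordant CCSs; barcode GGT has two discordant ones
toy_ccs = pd.DataFrame(
    {
        "library": ["lib1"] * 5,
        "barcode": ["AAC", "AAC", "AAC", "GGT", "GGT"],
        "mutations": ["A5C", "A5C", "A5C", "T2A", ""],
    }
)
toy_consensus, toy_dropped = alignparse.consensus.simple_mutconsensus(
    toy_ccs, mutation_col="mutations", min_support=2
)
# expect: AAC called with consensus "A5C" (variant_call_support of 3),
# while GGT is dropped because its two CCSs disagree on T2A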
# get parameters for building consensus
consensus_params = config["consensus_params"]
print(
"Building consensus sequences with following settings\n "
+ "\n ".join(f"{param}={val}" for param, val in consensus_params.items())
)
# build consensus sequences and plot results
max_plot_nseqs = 15 # group nseqs >= this together
plot_width = 225
consensus, dropped = alignparse.consensus.simple_mutconsensus(
aligned_ccs, mutation_col="gene_mutations", **consensus_params
)
consensus_stats = (
pd.concat(
[
consensus.rename(columns={"variant_call_support": "nseqs"})
.drop(columns="gene_mutations")
.assign(drop_reason="retained", dropped=False),
dropped.assign(dropped=True),
]
)
.assign(nseqs=lambda x: x["nseqs"].clip(upper=max_plot_nseqs))
.groupby(["library", "drop_reason", "dropped", "nseqs"], as_index=False)
.aggregate(
n_barcodes=pd.NamedAgg("barcode", "count"),
n_CCSs=pd.NamedAgg("nseqs", "sum"),
)
.rename(columns={"drop_reason": "category"})
.melt(
id_vars=["library", "category", "dropped", "nseqs"],
value_vars=["n_barcodes", "n_CCSs"],
var_name="type_of_count",
value_name="count",
)
)
# get drop reasons in order to plot
drop_reasons = (
consensus_stats.groupby(["category", "dropped"], as_index=False)
.aggregate({"count": "sum"})
.sort_values(["dropped", "count"], ascending=[True, False])["category"]
.tolist()
)
drop_colors = ["blue", "orange", "orangered", "goldenrod", "gold", "darkorange"]
assert len(drop_reasons) <= len(drop_colors)
consensus_stats_chart = (
alt.Chart(consensus_stats)
.encode(
x=alt.X(
"nseqs",
scale=alt.Scale(domain=(1, max_plot_nseqs)),
title="number of CCSs for barcode",
),
y=alt.Y("count", stack=True),
color=alt.Color(
"category",
sort=drop_reasons,
scale=alt.Scale(range=drop_colors),
),
row=alt.Row(
"library",
title=None,
),
column=alt.Column("type_of_count"),
tooltip=consensus_stats.columns.tolist(),
)
.mark_bar(size=0.75 * plot_width / max_plot_nseqs)
.properties(width=plot_width, height=120)
.configure_axis(grid=False)
.resolve_scale(y="independent")
)
display(consensus_stats_chart)
Building consensus sequences with following settings
 max_sub_diffs=None
 max_indel_diffs=None
 max_minor_sub_frac=0.2
 max_minor_indel_frac=0.2
 min_support=2
Write consensus sequences to barcode-variant lookup tables for subsequent use¶
We filter out any consensus sequences with indels that are not in-frame codon-length deletions (that is, we remove out-of-frame indels), and then write the remaining sequences to a barcode-variant lookup table for later use:
# annotate with information on mutation types
consensus = alignparse.consensus.add_mut_info_cols(
consensus,
mutation_col="gene_mutations",
sub_str_col="substitutions",
n_indel_col="n_indels",
)
# plot summary stats
stats = (
consensus.assign(
out_of_frame_indel=lambda x: x["n_indels"] > 0,
in_frame_codon_deletion=lambda x: (
(~x["out_of_frame_indel"]) & (x["substitutions"].str.contains("-"))
),
no_indel=lambda x: (~x["out_of_frame_indel"]) & (~x["in_frame_codon_deletion"]),
)[["library", "out_of_frame_indel", "in_frame_codon_deletion", "no_indel"]]
.groupby("library", as_index=False)
.aggregate("sum")
.melt(
id_vars="library",
var_name="category",
value_name="n_barcodes",
)
.assign(retained=lambda x: x["category"] != "out_of_frame_indel")
)
stats_chart = (
alt.Chart(stats)
.encode(
x=alt.X("n_barcodes"),
y=alt.Y("category", title=None),
color=alt.Color("retained"),
tooltip=stats.columns.tolist(),
facet=alt.Facet("library", columns=2),
)
.mark_bar()
.properties(width=175, height=45)
)
display(stats_chart)
# write to file
consensus = consensus.query("n_indels == 0")
nt_variants = "results/variants/nt_variants.csv"
os.makedirs(os.path.dirname(nt_variants), exist_ok=True)
print(f"Writing {len(df)} consensus sequences to {nt_variants}")
_ = consensus.to_csv(
consensus[["library", "barcode", "substitutions", "variant_call_support"]].to_csv(
nt_variants,
index=False,
)
)
Writing 831480 consensus sequences to results/variants/nt_variants.csv
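Downstream notebooks can then read this lookup table back; as when reading the aligned CCSs above, passing a falsy na_filter keeps the empty substitution strings of unmutated variants as empty strings rather than NaN. For example:
# read the barcode-variant lookup table back for downstream use
nt_variants_df = pd.read_csv("results/variants/nt_variants.csv", na_filter=None)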