Build PacBio consensus sequences¶

This notebook builds consensus sequences for each barcode from the PacBio CCS sequencing.

Import Python modules:

In [1]:
import itertools
import os

import Bio.SeqIO

import alignparse.consensus
import alignparse.utils

import altair as alt

import dms_variants.barcodes

import numpy

import pandas as pd

import scipy.stats

import yaml
In [2]:
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

In [3]:
# If you are running notebook interactively rather than in pipeline that handles
# working directories, you may have to first `os.chdir` to appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Get the aligned CCSs:

In [4]:
aligned_ccs = pd.read_csv(
    "results/process_ccs/CCSs_aligned_to_amplicon.csv",
    na_filter=None,
)

Get the gene sequence:

In [5]:
geneseq = str(Bio.SeqIO.read(config["gene_sequence_codon"], "fasta").seq)

print(f"Read gene of length {len(geneseq)} nucleotides")
Read gene of length 1599 nucleotides

Identify strand exchange and filter CCSs known to have it¶

We may have variant tags at the end of eac read which allow us to quantify strand exchange and filter out some strand exchange (that between variants with different tags):

In [6]:
variant_tags = config["variant_tags"]

if not variant_tags:
    print("No variant tags specified, so cannot look for strand exchange")

else:
    display(pd.DataFrame(variant_tags))

    # invert dictionary so tag identities given by nucleotide identity
    variant_tags_by_nt = {
        tag: {val: key for key, val in vals.items()}
        for tag, vals in variant_tags.items()
    }
    assert all(
        len(d1) == len(d2)
        for d1, d2 in zip(variant_tags.items(), variant_tags_by_nt.items())
    )

    # function to assign strand exchange status
    def classify_strand_exchange(row):
        tags = list({row[tag] for tag in variant_tags})
        if len(tags) == 1:
            return tags[0]
        elif "wildtype" in tags:
            return "partially wildtype"
        elif "invalid" in tags:
            return "invalid nucleotide"
        else:
            return "strand exchange"

    # assign strand exchange status
    for tag, d in variant_tags_by_nt.items():
        aligned_ccs[tag] = aligned_ccs[f"{tag}_sequence"].map(d).fillna("invalid")
    aligned_ccs["strand_exchange"] = aligned_ccs.apply(classify_strand_exchange, axis=1)

    # get summary stats
    strand_exchange_stats = (
        aligned_ccs.groupby(["pacbioRun", "strand_exchange"], as_index=False)
        .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
        .assign(
            fraction=lambda x: x["n_CCSs"]
            / x.groupby("pacbioRun")["n_CCSs"].transform("sum")
        )
    )

    # plot summary stats
    strand_exchange_chart = (
        alt.Chart(strand_exchange_stats)
        .encode(
            x="n_CCSs:Q",
            y=alt.Y(
                "strand_exchange:N",
                axis=alt.Axis(title=None),
            ),
            facet=alt.Facet("pacbioRun", columns=2, title=None),
            tooltip=strand_exchange_stats.columns.tolist(),
        )
        .mark_bar()
        .properties(width=250, height=100)
        .resolve_scale(x="independent")
    )
    display(strand_exchange_chart)

    # Filter out CCSs with strand exchange. Note that this approach is only
    # expected to catch ~half of CCSs with strand exchange.
    print(f"There are {len(aligned_ccs)} before filtering.")
    aligned_ccs = aligned_ccs.query('strand_exchange != "strand exchange"')
    print(f"There are {len(aligned_ccs)} after filtering.")
variant_tag5 variant_tag3
variant_1 G G
variant_2 C C
wildtype A A
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor
There are 3932021 before filtering.
There are 3896812 after filtering.

Filter CCSs for accuracy¶

Plot the gene and barcode accuracy for each CCS, and only keep those above an accuracy threshold:

In [7]:
max_error_rate = config["max_ccs_error_rate"]  # require error rate <= this to keep CCS

log10_error_floor = -7  # error rates less than this set to this for plotting
log10_error_ceil = -2  # error rates greater than this set to this for plotting
nbins = 100  # bins for cumulative fraction plot

# calculate error rates
aligned_ccs = aligned_ccs.assign(
    gene_error=lambda x: 1 - x["gene_accuracy"],
    barcode_error=lambda x: 1 - x["barcode_accuracy"],
)

# calculate cumulative frequencies on log error rates
cumfrac = (
    aligned_ccs.melt(
        id_vars=["query_name", "pacbioRun"],
        value_vars=["barcode_error", "gene_error"],
        value_name="error",
    )
    .assign(log10_error=lambda x: numpy.log10(x["error"]).clip(lower=log10_error_floor))
    .groupby(["variable", "pacbioRun"])
    .apply(
        lambda g: pd.DataFrame(
            {
                "cumulative_count": scipy.stats.cumfreq(
                    g["log10_error"],
                    numbins=nbins,
                    defaultreallimits=(log10_error_floor, log10_error_ceil),
                )[0],
                "log10_error": numpy.linspace(
                    log10_error_floor, log10_error_ceil, nbins
                ),
            }
        )
    )
    .assign(
        meets_accuracy_cutoff=lambda x: x["log10_error"] <= numpy.log10(max_error_rate),
        cumulative_fraction=lambda x: x["cumulative_count"] / len(aligned_ccs),
    )
    .reset_index()
)

# plot cumulative frequencies
cumfrac_chart = (
    alt.Chart(cumfrac)
    .encode(
        x=alt.X(
            "log10_error",
            scale=alt.Scale(zero=False),
        ),
        y="cumulative_count",
        color="meets_accuracy_cutoff",
        column=alt.Column("variable", title=None),
        row=alt.Row("pacbioRun", title=None),
        tooltip=[
            alt.Tooltip("log10_error", format=".3f"),
            alt.Tooltip("cumulative_fraction", format=".3f"),
            alt.Tooltip("cumulative_count", format=".4g"),
        ],
    )
    .mark_point(filled=True, size=30)
    .properties(width=225, height=120)
)
display(cumfrac_chart)
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor

Remove CCSs that do not meet accuracy threshold:

In [8]:
print(f"There are {len(aligned_ccs)} CCSs before accuracy filtering.")
aligned_ccs = aligned_ccs.query("barcode_error <= @max_error_rate").query(
    "gene_error <= @max_error_rate"
)
print(f"After filtering {len(aligned_ccs)} CCSs remain.")
There are 3896812 CCSs before accuracy filtering.
After filtering 2877442 CCSs remain.

Convert in-frame deletions to substitutions¶

If a deletion is in-frame, convert to substitution format using - as the substitution character:

In [9]:
deltosubs = alignparse.utils.InFrameDeletionsToSubs(geneseq)

aligned_ccs = (
    aligned_ccs.assign(
        mutations_inframe=lambda x: x["gene_mutations"].map(deltosubs.dels_to_subs),
        inframe_deletion=lambda x: x["gene_mutations"] != x["mutations_inframe"],
    )
    .drop(columns="gene_mutations")
    .rename(columns={"mutations_inframe": "gene_mutations"})
)

How many CCSs have in-frame deletions?

In [10]:
print("Number of CCSs with in-frame deletions:")
display(
    aligned_ccs.groupby("inframe_deletion")
    .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
    .assign(fraction=lambda x: x["n_CCSs"] / x["n_CCSs"].sum())
    .round(3)
)
Number of CCSs with in-frame deletions:
n_CCSs fraction
inframe_deletion
False 2875417 0.999
True 2025 0.001

Empirical accuracy of CCSs¶

We can compute the empirical accuracy of individual CCSs by comparing mutations found in CCSs with the same barcode.

First, annotate which CCSs have indels (not in frame):

In [11]:
aligned_ccs = alignparse.consensus.add_mut_info_cols(
    aligned_ccs,
    mutation_col="gene_mutations",
    n_indel_col="n_indels",
    overwrite_cols=True,
).assign(has_indel=lambda x: x["n_indels"] > 0)

Now compute empirical accuracy, for all sequences and only those without indels, and at the error-rate filter we chose to use and one 10-fold higher. Note that the in-frame codon deletions are not classified as deletions since we have re-classified as substitutions above. The empirical accuracy is the estimated accuracy of each individual CCS:

In [12]:
# compute empirical accuracies
empirical_acc = []
for no_indel, acc_10x in itertools.product([True, False], [True, False]):
    df = aligned_ccs.copy()
    if not no_indel:
        df = df.query("n_indels == 0")
        label = "no indels"
    else:
        label = "allow indels"
    if acc_10x:
        max_error_rate_10x = max_error_rate / 10
        df = df.query("barcode_error <= @max_error_rate_10x").query(
            "gene_error <= @max_error_rate_10x"
        )
        label += ", 10x accuracy"
    for pacbioRun, run_df in df.groupby("pacbioRun"):
        empirical_acc.append(
            alignparse.consensus.empirical_accuracy(
                run_df, upstream_group_cols=None, mutation_col="gene_mutations"
            ).assign(label=label, pacbioRun=pacbioRun)
        )
empirical_acc = pd.concat(empirical_acc, ignore_index=True)

# plot empirical accuracy
print("Empirical accuracies:")
empirical_acc_chart = (
    alt.Chart(empirical_acc)
    .encode(
        x=alt.X(
            "accuracy:Q",
            scale=alt.Scale(domain=(0, 1)),
        ),
        y=alt.Y(
            "label:N",
            axis=alt.Axis(title=None),
        ),
        facet=alt.Facet(
            "pacbioRun",
            title=None,
            columns=2,
        ),
        tooltip=[alt.Tooltip("accuracy", format=".3f")],
    )
    .mark_point(filled=True, size=75)
    .properties(width=225, height=65)
)
display(empirical_acc_chart)
Empirical accuracies:
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor

Rarefaction and other estimates of library diversity¶

To help understand how many barcodes there are and how evenly the number of reads are distributed among these barcodes, we make rarefaction curves. We also compute a few other metrics of diversity:

  1. number of barcodes observed >nn times up to the minimum number of CCSs required to call a consensus below.
  2. inverse Simpson index, which is reciprocal of probability two randomly drawn sequences have same barcode.
In [13]:
# required CCSs per barcode to call consensus below
min_support = config["consensus_params"]["min_support"]


# number of counts for each library / barcode
barcodecounts = aligned_ccs.groupby(["library", "barcode"], as_index=False).aggregate(
    count=pd.NamedAgg("query_name", "count")
)

# make rarefaction plot
rarefy_df = pd.concat(
    [
        (
            dms_variants.barcodes.rarefyBarcodes(
                df, maxpoints=1000, logspace=False
            ).assign(library=library)
        )
        for library, df in barcodecounts.groupby("library")
    ]
)
rarefy_chart = (
    alt.Chart(rarefy_df)
    .encode(
        x=alt.X(
            "ncounts",
            title="number of CCSs",
        ),
        y=alt.X(
            "nbarcodes",
            title="number of barcodes",
        ),
        color=alt.Color("library"),
        tooltip=rarefy_df.columns.tolist(),
    )
    .mark_point(size=10, filled=True)
    .resolve_scale(y="independent")
    .properties(height=175, width=300)
)
display(rarefy_chart)

# compute diversity statistics
diversity = dms_variants.barcodes.inverse_simpson_index(barcodecounts).melt(
    id_vars="library",
    value_vars="inverse_simpson_index",
    var_name="metric",
    value_name="diversity",
)
for n in range(min_support):
    diversity = pd.concat(
        [
            diversity,
            barcodecounts.query("count > @n")
            .groupby("library", as_index=False)
            .aggregate(diversity=pd.NamedAgg("barcode", "nunique"))
            .assign(metric=f"barcodes with >{n} CCSs"),
        ],
        ignore_index=True,
    )
diversity_chart = (
    alt.Chart(diversity)
    .encode(
        x=alt.X("diversity"),
        y=alt.Y("metric", title=None),
        facet=alt.Facet("library", columns=2, title=None),
        tooltip=diversity.columns.tolist(),
    )
    .mark_bar()
    .properties(width=200, height=15 * (min_support + 1))
)
display(diversity_chart)
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor

Build consensus sequences¶

Use the alignparse.consensus.simple_mutconsensus method to build consensus sequences, and plot how many barcodes and CCSs contributed to valid consensuses or had to be dropped. Note the stats for barcodes and CCSs look different, because there are uneven numbers of CCSs per barcode:

In [14]:
# get parameters for building consensus
consensus_params = config["consensus_params"]
print(
    "Building consensus sequences with following settings\n  "
    + "\n  ".join(f"{param}={val}" for param, val in consensus_params.items())
)

# build consensus sequences and plot results
max_plot_nseqs = 15  # group nseqs >= this together
plot_width = 225

consensus, dropped = alignparse.consensus.simple_mutconsensus(
    aligned_ccs, mutation_col="gene_mutations", **consensus_params
)

consensus_stats = (
    pd.concat(
        [
            consensus.rename(columns={"variant_call_support": "nseqs"})
            .drop(columns="gene_mutations")
            .assign(drop_reason="retained", dropped=False),
            dropped.assign(dropped=True),
        ]
    )
    .assign(nseqs=lambda x: x["nseqs"].clip(upper=max_plot_nseqs))
    .groupby(["library", "drop_reason", "dropped", "nseqs"], as_index=False)
    .aggregate(
        n_barcodes=pd.NamedAgg("barcode", "count"),
        n_CCSs=pd.NamedAgg("nseqs", "sum"),
    )
    .rename(columns={"drop_reason": "category"})
    .melt(
        id_vars=["library", "category", "dropped", "nseqs"],
        value_vars=["n_barcodes", "n_CCSs"],
        var_name="type_of_count",
        value_name="count",
    )
)

# get drop reasons in order to plot
drop_reasons = (
    consensus_stats.groupby(["category", "dropped"], as_index=False)
    .aggregate({"count": "sum"})
    .sort_values(["dropped", "count"], ascending=[True, False])["category"]
    .tolist()
)
drop_colors = ["blue", "orange", "orangered", "goldenrod", "gold", "darkorange"]
assert len(drop_reasons) <= len(drop_colors)

consensus_stats_chart = (
    alt.Chart(consensus_stats)
    .encode(
        x=alt.X(
            "nseqs",
            scale=alt.Scale(domain=(1, max_plot_nseqs)),
            title="number of CCSs for barcode",
        ),
        y=alt.Y("count", stack=True),
        color=alt.Color(
            "category",
            sort=drop_reasons,
            scale=alt.Scale(range=drop_colors),
        ),
        row=alt.Row(
            "library",
            title=None,
        ),
        column=alt.Column("type_of_count"),
        tooltip=consensus_stats.columns.tolist(),
    )
    .mark_bar(size=0.75 * plot_width / max_plot_nseqs)
    .properties(width=plot_width, height=120)
    .configure_axis(grid=False)
    .resolve_scale(y="independent")
)

display(consensus_stats_chart)
Building consensus sequences with following settings
  max_sub_diffs=None
  max_indel_diffs=None
  max_minor_sub_frac=0.2
  max_minor_indel_frac=0.2
  min_support=3
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor

Write consensus sequences to barcode-variant lookup tables for subsequent use¶

We filter any consensus sequences with indels that are not in-frame codon-length deletions (so filtering out-of-frame indels), and then write the remaining sequences to barcode-variant lookup tables for later use:

In [15]:
# annotate with information on mutation types
consensus = alignparse.consensus.add_mut_info_cols(
    consensus,
    mutation_col="gene_mutations",
    sub_str_col="substitutions",
    n_indel_col="n_indels",
)

# plot summary stats
stats = (
    consensus.assign(
        out_of_frame_indel=lambda x: x["n_indels"] > 0,
        in_frame_codon_deletion=lambda x: (
            (~x["out_of_frame_indel"]) & (x["substitutions"].str.contains("-"))
        ),
        no_indel=lambda x: (~x["out_of_frame_indel"]) & (~x["in_frame_codon_deletion"]),
    )[["library", "out_of_frame_indel", "in_frame_codon_deletion", "no_indel"]]
    .groupby("library", as_index=False)
    .aggregate("sum")
    .melt(
        id_vars="library",
        var_name="category",
        value_name="n_barcodes",
    )
    .assign(retained=lambda x: x["category"] != "out_of_frame_indel")
)
stats_chart = (
    alt.Chart(stats)
    .encode(
        x=alt.X("n_barcodes"),
        y=alt.Y("category", title=None),
        color=alt.Color("retained"),
        tooltip=stats.columns.tolist(),
        facet=alt.Facet("library", columns=2),
    )
    .mark_bar()
    .properties(width=175, height=45)
)
display(stats_chart)

# write to file
consensus = consensus.query("n_indels == 0")
nt_variants = "results/variants/nt_variants.csv"
os.makedirs(os.path.dirname(nt_variants), exist_ok=True)
print(f"Writing {len(df)} consensus sequences to {nt_variants}")
_ = consensus.to_csv(
    consensus[["library", "barcode", "substitutions", "variant_call_support"]].to_csv(
        nt_variants,
        index=False,
    )
)
Save as SVGSave as PNGView SourceView Compiled VegaOpen in Vega Editor
Writing 2658151 consensus sequences to results/variants/nt_variants.csv