Build PacBio consensus sequences¶

This notebook builds consensus sequences for each barcode from the PacBio CCS sequencing.

Import Python modules:

In [1]:
import itertools
import os

import Bio.SeqIO

import alignparse.consensus
import alignparse.utils

import altair as alt

import dms_variants.barcodes

import numpy

import pandas as pd

import scipy.stats

import yaml
In [2]:
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

In [3]:
# If you are running this notebook interactively rather than in a pipeline that
# handles working directories, you may first have to `os.chdir` to the appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Get the aligned CCSs:

In [4]:
aligned_ccs = pd.read_csv(
    "results/process_ccs/CCSs_aligned_to_amplicon.csv",
    na_filter=None,  # keep empty strings as strings rather than converting to NaN
)

Get the gene sequence:

In [5]:
geneseq = str(Bio.SeqIO.read(config["gene_sequence_codon"], "fasta").seq)

print(f"Read gene of length {len(geneseq)} nucleotides")
Read gene of length 1476 nucleotides
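
Downstream steps classify deletions per codon, so a quick sanity check that the gene is a whole number of codons may be worthwhile (an optional addition, not part of the original pipeline):

# optional check (not in the original notebook): the gene should be an
# integral number of codons since deletions are classified per codon below
assert len(geneseq) % 3 == 0, "gene length is not a multiple of 3"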

Identify strand exchange and filter CCSs known to have it¶

We may have variant tags at the end of each read that allow us to quantify strand exchange and filter out some of it (specifically, exchange between variants carrying different tags):
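
For orientation, here is a minimal sketch of the structure `config["variant_tags"]` is assumed to take (illustrative values mirroring the tag table displayed in the cell output below; the real values come from `config.yaml`):

# hypothetical example of the variant-tag configuration: outer keys name the
# tag sites, inner dicts map variant identity to the nucleotide at that site
example_variant_tags = {
    "variant_tag5": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
    "variant_tag3": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
}
# a CCS whose 5' tag says "variant_1" but whose 3' tag says "variant_2" must
# come from a template switch between strands, so it is called strand exchange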

In [6]:
variant_tags = config["variant_tags"]

if not variant_tags:
    print("No variant tags specified, so cannot look for strand exchange")

else:
    display(pd.DataFrame(variant_tags))

    # invert dictionary so tag identities given by nucleotide identity
    variant_tags_by_nt = {
        tag: {val: key for key, val in vals.items()}
        for tag, vals in variant_tags.items()
    }
    assert all(
        len(d1) == len(d2)
        for d1, d2 in zip(variant_tags.values(), variant_tags_by_nt.values())
    ), "tag nucleotides not unique, so inverting the dictionary lost entries"

    # function to assign strand exchange status
    def classify_strand_exchange(row):
        tags = list({row[tag] for tag in variant_tags})
        if len(tags) == 1:
            return tags[0]
        elif "wildtype" in tags:
            return "partially wildtype"
        elif "invalid" in tags:
            return "invalid nucleotide"
        else:
            return "strand exchange"

    # assign strand exchange status
    for tag, d in variant_tags_by_nt.items():
        aligned_ccs[tag] = aligned_ccs[f"{tag}_sequence"].map(d).fillna("invalid")
    aligned_ccs["strand_exchange"] = aligned_ccs.apply(classify_strand_exchange, axis=1)

    # get summary stats
    strand_exchange_stats = (
        aligned_ccs.groupby(["pacbioRun", "strand_exchange"], as_index=False)
        .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
        .assign(
            fraction=lambda x: x["n_CCSs"]
            / x.groupby("pacbioRun")["n_CCSs"].transform("sum")
        )
    )

    # plot summary stats
    strand_exchange_chart = (
        alt.Chart(strand_exchange_stats)
        .encode(
            x="n_CCSs:Q",
            y=alt.Y(
                "strand_exchange:N",
                axis=alt.Axis(title=None),
            ),
            facet=alt.Facet("pacbioRun", columns=2, title=None),
            tooltip=strand_exchange_stats.columns.tolist(),
        )
        .mark_bar()
        .properties(width=250, height=100)
        .resolve_scale(x="independent")
    )
    display(strand_exchange_chart)

    # Filter out CCSs with strand exchange. Note that this approach is only
    # expected to catch ~half of CCSs with strand exchange.
    print(f"There are {len(aligned_ccs)} before filtering.")
    aligned_ccs = aligned_ccs.query('strand_exchange != "strand exchange"')
    print(f"There are {len(aligned_ccs)} after filtering.")
           variant_tag5 variant_tag3
variant_1             G            G
variant_2             C            C
wildtype              A            A

There are 2025164 CCSs before filtering.
There are 2012023 CCSs after filtering.

Filter CCSs for accuracy¶

Plot the gene and barcode accuracy for each CCS, and only keep those above an accuracy threshold:
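
As a quick worked example of the conversion used below (the numbers are hypothetical):

# hypothetical worked example of the accuracy -> error-rate transform
acc = 0.9999  # accuracy of a single CCS
err = 1 - acc  # error rate of 1e-4
print(numpy.log10(err))  # ~ -4.0; the CCS is kept only if err <= max_ccs_error_rate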

In [7]:
max_error_rate = config["max_ccs_error_rate"]  # require error rate <= this to keep CCS

log10_error_floor = -7  # error rates less than this set to this for plotting
log10_error_ceil = -2  # error rates greater than this set to this for plotting
nbins = 100  # bins for cumulative fraction plot

# calculate error rates
aligned_ccs = aligned_ccs.assign(
    gene_error=lambda x: 1 - x["gene_accuracy"],
    barcode_error=lambda x: 1 - x["barcode_accuracy"],
)

# calculate cumulative frequencies on log error rates
cumfrac = (
    aligned_ccs.melt(
        id_vars=["query_name", "pacbioRun"],
        value_vars=["barcode_error", "gene_error"],
        value_name="error",
    )
    .assign(log10_error=lambda x: numpy.log10(x["error"]).clip(lower=log10_error_floor))
    .groupby(["variable", "pacbioRun"])
    .apply(
        lambda g: pd.DataFrame(
            {
                "cumulative_count": scipy.stats.cumfreq(
                    g["log10_error"],
                    numbins=nbins,
                    defaultreallimits=(log10_error_floor, log10_error_ceil),
                )[0],
                "log10_error": numpy.linspace(
                    log10_error_floor, log10_error_ceil, nbins
                ),
            }
        )
    )
    .assign(
        meets_accuracy_cutoff=lambda x: x["log10_error"] <= numpy.log10(max_error_rate),
        cumulative_fraction=lambda x: x["cumulative_count"] / len(aligned_ccs),
    )
    .reset_index()
)

# plot cumulative frequencies
cumfrac_chart = (
    alt.Chart(cumfrac)
    .encode(
        x=alt.X(
            "log10_error",
            scale=alt.Scale(zero=False),
        ),
        y="cumulative_count",
        color="meets_accuracy_cutoff",
        column=alt.Column("variable", title=None),
        row=alt.Row("pacbioRun", title=None),
        tooltip=[
            alt.Tooltip("log10_error", format=".3f"),
            alt.Tooltip("cumulative_fraction", format=".3f"),
            alt.Tooltip("cumulative_count", format=".4g"),
        ],
    )
    .mark_point(filled=True, size=30)
    .properties(width=225, height=120)
)
display(cumfrac_chart)

Remove CCSs that do not meet accuracy threshold:

In [8]:
print(f"There are {len(aligned_ccs)} CCSs before accuracy filtering.")
aligned_ccs = aligned_ccs.query("barcode_error <= @max_error_rate").query(
    "gene_error <= @max_error_rate"
)
print(f"After filtering {len(aligned_ccs)} CCSs remain.")
There are 2012023 CCSs before accuracy filtering.
After filtering 1686281 CCSs remain.

Convert in-frame deletions to substitutions¶

If a deletion is in-frame, convert it to substitution format, using `-` as the substitution character:
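
A toy sketch of what this conversion does (the mutation-string formats follow alignparse conventions; the exact strings below are illustrative assumptions, not taken from this dataset):

# illustrative only: an in-frame codon-length deletion becomes gap ("-")
# substitutions, while a deletion not spanning whole codons is left alone
toy = alignparse.utils.InFrameDeletionsToSubs("ATGGAATGC")
print(toy.dels_to_subs("del4to6"))  # expect something like "G4- A5- A6-"
print(toy.dels_to_subs("del3to5"))  # not in-frame, so returned unchanged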

In [9]:
deltosubs = alignparse.utils.InFrameDeletionsToSubs(geneseq)

aligned_ccs = (
    aligned_ccs.assign(
        mutations_inframe=lambda x: x["gene_mutations"].map(deltosubs.dels_to_subs),
        inframe_deletion=lambda x: x["gene_mutations"] != x["mutations_inframe"],
    )
    .drop(columns="gene_mutations")
    .rename(columns={"mutations_inframe": "gene_mutations"})
)

How many CCSs have in-frame deletions?

In [10]:
print("Number of CCSs with in-frame deletions:")
display(
    aligned_ccs.groupby("inframe_deletion")
    .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
    .assign(fraction=lambda x: x["n_CCSs"] / x["n_CCSs"].sum())
    .round(3)
)
Number of CCSs with in-frame deletions:
                   n_CCSs  fraction
inframe_deletion
False             1684719     0.999
True                 1562     0.001

Empirical accuracy of CCSs¶

We can compute the empirical accuracy of individual CCSs by comparing mutations found in CCSs with the same barcode.
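
The intuition, as a rough sketch (a simplification; `alignparse.consensus.empirical_accuracy` does the real calculation): if two independent CCSs of the same variant agree only when both are error-free, then an observed pairwise agreement f implies a per-CCS accuracy of roughly sqrt(f):

# back-of-envelope intuition only, not the pipeline's actual calculation
f = 0.90  # hypothetical fraction of same-barcode CCS pairs with identical mutations
print(f"implied per-CCS accuracy ~ {f**0.5:.3f}")  # ~0.949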

First, annotate which CCSs have indels (after the conversion above, any remaining indels are not in-frame codon-length deletions):

In [11]:
aligned_ccs = alignparse.consensus.add_mut_info_cols(
    aligned_ccs,
    mutation_col="gene_mutations",
    n_indel_col="n_indels",
    overwrite_cols=True,
).assign(has_indel=lambda x: x["n_indels"] > 0)

Now compute the empirical accuracy, both for all sequences and for only those without indels, and both at the error-rate filter we chose to use and at one 10-fold more stringent. Note that in-frame codon deletions are not classified as deletions here since we re-classified them as substitutions above. The empirical accuracy is the estimated accuracy of each individual CCS:

In [12]:
# compute empirical accuracies
empirical_acc = []
for no_indel, acc_10x in itertools.product([True, False], [True, False]):
    df = aligned_ccs.copy()
    if no_indel:
        df = df.query("n_indels == 0")
        label = "no indels"
    else:
        label = "allow indels"
    if acc_10x:
        max_error_rate_10x = max_error_rate / 10
        df = df.query("barcode_error <= @max_error_rate_10x").query(
            "gene_error <= @max_error_rate_10x"
        )
        label += ", 10x accuracy"
    for pacbioRun, run_df in df.groupby("pacbioRun"):
        empirical_acc.append(
            alignparse.consensus.empirical_accuracy(
                run_df, upstream_group_cols=None, mutation_col="gene_mutations"
            ).assign(label=label, pacbioRun=pacbioRun)
        )
empirical_acc = pd.concat(empirical_acc, ignore_index=True)

# plot empirical accuracy
print("Empirical accuracies:")
empirical_acc_chart = (
    alt.Chart(empirical_acc)
    .encode(
        x=alt.X(
            "accuracy:Q",
            scale=alt.Scale(domain=(0, 1)),
        ),
        y=alt.Y(
            "label:N",
            axis=alt.Axis(title=None),
        ),
        facet=alt.Facet(
            "pacbioRun",
            title=None,
            columns=2,
        ),
        tooltip=[alt.Tooltip("accuracy", format=".3f")],
    )
    .mark_point(filled=True, size=75)
    .properties(width=225, height=65)
)
display(empirical_acc_chart)
Empirical accuracies:

Rarefaction and other estimates of library diversity¶

To help understand how many barcodes there are and how evenly reads are distributed among them, we make rarefaction curves. We also compute a few other metrics of diversity:

  1. the number of barcodes observed >$n$ times, for each $n$ up to the minimum number of CCSs required to call a consensus below.
  2. the inverse Simpson index, which is the reciprocal of the probability that two randomly drawn sequences have the same barcode (see the worked example below).
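
As a worked illustration of the inverse Simpson index (not part of the pipeline; the counts are hypothetical):

# with barcode frequencies p_i, the inverse Simpson index is 1 / sum(p_i**2),
# the reciprocal of the probability that two random draws share a barcode
counts = numpy.array([5, 3, 2])  # hypothetical CCS counts for three barcodes
p = counts / counts.sum()
print(1 / (p**2).sum())  # ~2.63 "effective" equally common barcodes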
In [13]:
# required CCSs per barcode to call consensus below
min_support = config["consensus_params"]["min_support"]


# number of counts for each library / barcode
barcodecounts = aligned_ccs.groupby(["library", "barcode"], as_index=False).aggregate(
    count=pd.NamedAgg("query_name", "count")
)

# make rarefaction plot
rarefy_df = pd.concat(
    [
        (
            dms_variants.barcodes.rarefyBarcodes(
                df, maxpoints=1000, logspace=False
            ).assign(library=library)
        )
        for library, df in barcodecounts.groupby("library")
    ]
)
rarefy_chart = (
    alt.Chart(rarefy_df)
    .encode(
        x=alt.X(
            "ncounts",
            title="number of CCSs",
        ),
        y=alt.Y(
            "nbarcodes",
            title="number of barcodes",
        ),
        color=alt.Color("library"),
        tooltip=rarefy_df.columns.tolist(),
    )
    .mark_point(size=10, filled=True)
    .resolve_scale(y="independent")
    .properties(height=175, width=300)
)
display(rarefy_chart)

# compute diversity statistics
diversity = dms_variants.barcodes.inverse_simpson_index(barcodecounts).melt(
    id_vars="library",
    value_vars="inverse_simpson_index",
    var_name="metric",
    value_name="diversity",
)
for n in range(min_support):
    diversity = pd.concat(
        [
            diversity,
            barcodecounts.query("count > @n")
            .groupby("library", as_index=False)
            .aggregate(diversity=pd.NamedAgg("barcode", "nunique"))
            .assign(metric=f"barcodes with >{n} CCSs"),
        ],
        ignore_index=True,
    )
diversity_chart = (
    alt.Chart(diversity)
    .encode(
        x=alt.X("diversity"),
        y=alt.Y("metric", title=None),
        facet=alt.Facet("library", columns=2, title=None),
        tooltip=diversity.columns.tolist(),
    )
    .mark_bar()
    .properties(width=200, height=15 * (min_support + 1))
)
display(diversity_chart)

Build consensus sequences¶

Use the alignparse.consensus.simple_mutconsensus method to build consensus sequences, and plot how many barcodes and CCSs contributed to valid consensuses or had to be dropped. Note that the stats for barcodes and CCSs look different because there are uneven numbers of CCSs per barcode:
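
First, a minimal sketch of how this method behaves on a toy input (the tiny DataFrame is hypothetical, and the exact outputs depend on the consensus parameters):

# illustrative only: four CCSs for one barcode, three reporting "A5C" and one
# also carrying a minor "T9G"; with min_support=3 there are enough CCSs to
# call a consensus, and sufficiently rare minor mutations are dropped from it
toy_ccs = pd.DataFrame(
    {
        "library": ["lib1"] * 4,
        "barcode": ["AAAA"] * 4,
        "gene_mutations": ["A5C", "A5C", "A5C", "A5C T9G"],
    }
)
toy_consensus, toy_dropped = alignparse.consensus.simple_mutconsensus(
    toy_ccs, mutation_col="gene_mutations", min_support=3
)
display(toy_consensus)  # expect a single consensus row calling "A5C"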

In [14]:
# get parameters for building consensus
consensus_params = config["consensus_params"]
print(
    "Building consensus sequences with following settings\n  "
    + "\n  ".join(f"{param}={val}" for param, val in consensus_params.items())
)

# build consensus sequences and plot results
max_plot_nseqs = 15  # group nseqs >= this together
plot_width = 225

consensus, dropped = alignparse.consensus.simple_mutconsensus(
    aligned_ccs, mutation_col="gene_mutations", **consensus_params
)

consensus_stats = (
    pd.concat(
        [
            consensus.rename(columns={"variant_call_support": "nseqs"})
            .drop(columns="gene_mutations")
            .assign(drop_reason="retained", dropped=False),
            dropped.assign(dropped=True),
        ]
    )
    .assign(nseqs=lambda x: x["nseqs"].clip(upper=max_plot_nseqs))
    .groupby(["library", "drop_reason", "dropped", "nseqs"], as_index=False)
    .aggregate(
        n_barcodes=pd.NamedAgg("barcode", "count"),
        n_CCSs=pd.NamedAgg("nseqs", "sum"),
    )
    .rename(columns={"drop_reason": "category"})
    .melt(
        id_vars=["library", "category", "dropped", "nseqs"],
        value_vars=["n_barcodes", "n_CCSs"],
        var_name="type_of_count",
        value_name="count",
    )
)

# get drop reasons in order to plot
drop_reasons = (
    consensus_stats.groupby(["category", "dropped"], as_index=False)
    .aggregate({"count": "sum"})
    .sort_values(["dropped", "count"], ascending=[True, False])["category"]
    .tolist()
)
drop_colors = ["blue", "orange", "orangered", "goldenrod", "gold", "darkorange"]
assert len(drop_reasons) <= len(drop_colors)

consensus_stats_chart = (
    alt.Chart(consensus_stats)
    .encode(
        x=alt.X(
            "nseqs",
            scale=alt.Scale(domain=(1, max_plot_nseqs)),
            title="number of CCSs for barcode",
        ),
        y=alt.Y("count", stack=True),
        color=alt.Color(
            "category",
            sort=drop_reasons,
            scale=alt.Scale(range=drop_colors),
        ),
        row=alt.Row(
            "library",
            title=None,
        ),
        column=alt.Column("type_of_count"),
        tooltip=consensus_stats.columns.tolist(),
    )
    .mark_bar(size=0.75 * plot_width / max_plot_nseqs)
    .properties(width=plot_width, height=120)
    .configure_axis(grid=False)
    .resolve_scale(y="independent")
)

display(consensus_stats_chart)
Building consensus sequences with following settings
  max_sub_diffs=None
  max_indel_diffs=None
  max_minor_sub_frac=0.2
  max_minor_indel_frac=0.2
  min_support=3

Write consensus sequences to barcode-variant lookup tables for subsequent use¶

We filter out any consensus sequences with indels that are not in-frame codon-length deletions (that is, we remove sequences with out-of-frame indels), and then write the remaining sequences to barcode-variant lookup tables for later use:
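
For reference, downstream steps might read the table back like this (a minimal sketch, not part of this notebook; `na_filter=False` keeps the empty `substitutions` string of unmutated variants from becoming NaN):

# hypothetical downstream usage of the barcode-variant lookup table
nt_variants_df = pd.read_csv(
    "results/variants/nt_variants.csv",
    na_filter=False,
)
# columns: library, barcode, substitutions, variant_call_support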

In [15]:
# annotate with information on mutation types
consensus = alignparse.consensus.add_mut_info_cols(
    consensus,
    mutation_col="gene_mutations",
    sub_str_col="substitutions",
    n_indel_col="n_indels",
)

# plot summary stats
stats = (
    consensus.assign(
        out_of_frame_indel=lambda x: x["n_indels"] > 0,
        in_frame_codon_deletion=lambda x: (
            (~x["out_of_frame_indel"]) & (x["substitutions"].str.contains("-"))
        ),
        no_indel=lambda x: (~x["out_of_frame_indel"]) & (~x["in_frame_codon_deletion"]),
    )[["library", "out_of_frame_indel", "in_frame_codon_deletion", "no_indel"]]
    .groupby("library", as_index=False)
    .aggregate("sum")
    .melt(
        id_vars="library",
        var_name="category",
        value_name="n_barcodes",
    )
    .assign(retained=lambda x: x["category"] != "out_of_frame_indel")
)
stats_chart = (
    alt.Chart(stats)
    .encode(
        x=alt.X("n_barcodes"),
        y=alt.Y("category", title=None),
        color=alt.Color("retained"),
        tooltip=stats.columns.tolist(),
        facet=alt.Facet("library", columns=2),
    )
    .mark_bar()
    .properties(width=175, height=45)
)
display(stats_chart)

# write to file
consensus = consensus.query("n_indels == 0")  # drop out-of-frame indels
nt_variants = "results/variants/nt_variants.csv"
os.makedirs(os.path.dirname(nt_variants), exist_ok=True)
print(f"Writing {len(df)} consensus sequences to {nt_variants}")
_ = consensus.to_csv(
    consensus[["library", "barcode", "substitutions", "variant_call_support"]].to_csv(
        nt_variants,
        index=False,
    )
)
Writing 1585446 consensus sequences to results/variants/nt_variants.csv