Build PacBio consensus sequences¶

This notebook builds consensus sequences for each barcode from the PacBio CCS sequencing.

Import Python modules:

In [1]:
import itertools
import os

import Bio.SeqIO

import alignparse.consensus
import alignparse.utils

import altair as alt

import dms_variants.barcodes

import numpy

import pandas as pd

import scipy.stats

import yaml
In [2]:
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

In [3]:
# If you are running this notebook interactively rather than in a pipeline that
# handles working directories, you may first have to `os.chdir` to the appropriate directory.

with open("config.yaml") as f:
    config = yaml.safe_load(f)

Get the aligned CCSs:

In [4]:
aligned_ccs = pd.read_csv(
    "results/process_ccs/CCSs_aligned_to_amplicon.csv",
    na_filter=None,  # keep empty strings as strings rather than converting to NaN
)

Get the gene sequence:

In [5]:
geneseq = str(Bio.SeqIO.read(config["gene_sequence_codon"], "fasta").seq)

print(f"Read gene of length {len(geneseq)} nucleotides")
Read gene of length 1476 nucleotides
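
Downstream steps classify deletions per codon, so a quick sanity check that the gene is a whole number of codons may be worthwhile (an optional addition, not part of the original pipeline):

# optional check (not in the original notebook): the gene should be an
# integral number of codons since deletions are classified per codon below
assert len(geneseq) % 3 == 0, "gene length is not a multiple of 3"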

Identify strand exchange and filter CCSs known to have it¶

We may have variant tags at the end of each read that allow us to quantify strand exchange and filter out some of it (specifically, exchange between variants carrying different tags):
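
For orientation, here is a minimal sketch of the structure `config["variant_tags"]` is assumed to take (illustrative values mirroring the tag table displayed in the cell output below; the real values come from `config.yaml`):

# hypothetical example of the variant-tag configuration: outer keys name the
# tag sites, inner dicts map variant identity to the nucleotide at that site
example_variant_tags = {
    "variant_tag5": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
    "variant_tag3": {"variant_1": "G", "variant_2": "C", "wildtype": "A"},
}
# a CCS whose 5' tag says "variant_1" but whose 3' tag says "variant_2" must
# come from a template switch between strands, so it is called strand exchange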

In [6]:
variant_tags = config["variant_tags"]

if not variant_tags:
    print("No variant tags specified, so cannot look for strand exchange")

else:
    display(pd.DataFrame(variant_tags))

    # invert dictionary so tag identities given by nucleotide identity
    variant_tags_by_nt = {
        tag: {val: key for key, val in vals.items()}
        for tag, vals in variant_tags.items()
    }
    assert all(
        len(d1) == len(d2)
        for d1, d2 in zip(variant_tags.values(), variant_tags_by_nt.values())
    ), "tag nucleotides not unique, so inverting the dictionary lost entries"

    # function to assign strand exchange status
    def classify_strand_exchange(row):
        tags = list({row[tag] for tag in variant_tags})
        if len(tags) == 1:
            return tags[0]
        elif "wildtype" in tags:
            return "partially wildtype"
        elif "invalid" in tags:
            return "invalid nucleotide"
        else:
            return "strand exchange"

    # assign strand exchange status
    for tag, d in variant_tags_by_nt.items():
        aligned_ccs[tag] = aligned_ccs[f"{tag}_sequence"].map(d).fillna("invalid")
    aligned_ccs["strand_exchange"] = aligned_ccs.apply(classify_strand_exchange, axis=1)

    # get summary stats
    strand_exchange_stats = (
        aligned_ccs.groupby(["pacbioRun", "strand_exchange"], as_index=False)
        .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
        .assign(
            fraction=lambda x: x["n_CCSs"]
            / x.groupby("pacbioRun")["n_CCSs"].transform("sum")
        )
    )

    # plot summary stats
    strand_exchange_chart = (
        alt.Chart(strand_exchange_stats)
        .encode(
            x="n_CCSs:Q",
            y=alt.Y(
                "strand_exchange:N",
                axis=alt.Axis(title=None),
            ),
            facet=alt.Facet("pacbioRun", columns=2, title=None),
            tooltip=strand_exchange_stats.columns.tolist(),
        )
        .mark_bar()
        .properties(width=250, height=100)
        .resolve_scale(x="independent")
    )
    display(strand_exchange_chart)

    # Filter out CCSs with strand exchange. Note that this approach is only
    # expected to catch ~half of CCSs with strand exchange.
    print(f"There are {len(aligned_ccs)} before filtering.")
    aligned_ccs = aligned_ccs.query('strand_exchange != "strand exchange"')
    print(f"There are {len(aligned_ccs)} after filtering.")
           variant_tag5 variant_tag3
variant_1             G            G
variant_2             C            C
wildtype              A            A

There are 2025164 CCSs before filtering.
There are 2012023 CCSs after filtering.

Filter CCSs for accuracy¶

Plot the gene and barcode accuracy for each CCS, and only keep those above an accuracy threshold:
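
As a quick worked example of the conversion used below (the numbers are hypothetical):

# hypothetical worked example of the accuracy -> error-rate transform
acc = 0.9999  # accuracy of a single CCS
err = 1 - acc  # error rate of 1e-4
print(numpy.log10(err))  # ~ -4.0; the CCS is kept only if err <= max_ccs_error_rate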

In [7]:
max_error_rate = config["max_ccs_error_rate"]  # require error rate <= this to keep CCS

log10_error_floor = -7  # error rates less than this set to this for plotting
log10_error_ceil = -2  # error rates greater than this set to this for plotting
nbins = 100  # bins for cumulative fraction plot

# calculate error rates
aligned_ccs = aligned_ccs.assign(
    gene_error=lambda x: 1 - x["gene_accuracy"],
    barcode_error=lambda x: 1 - x["barcode_accuracy"],
)

# calculate cumulative frequencies on log error rates
cumfrac = (
    aligned_ccs.melt(
        id_vars=["query_name", "pacbioRun"],
        value_vars=["barcode_error", "gene_error"],
        value_name="error",
    )
    .assign(log10_error=lambda x: numpy.log10(x["error"]).clip(lower=log10_error_floor))
    .groupby(["variable", "pacbioRun"])
    .apply(
        lambda g: pd.DataFrame(
            {
                "cumulative_count": scipy.stats.cumfreq(
                    g["log10_error"],
                    numbins=nbins,
                    defaultreallimits=(log10_error_floor, log10_error_ceil),
                )[0],
                "log10_error": numpy.linspace(
                    log10_error_floor, log10_error_ceil, nbins
                ),
            }
        )
    )
    .assign(
        meets_accuracy_cutoff=lambda x: x["log10_error"] <= numpy.log10(max_error_rate),
        cumulative_fraction=lambda x: x["cumulative_count"] / len(aligned_ccs),
    )
    .reset_index()
)

# plot cumulative frequencies
cumfrac_chart = (
    alt.Chart(cumfrac)
    .encode(
        x=alt.X(
            "log10_error",
            scale=alt.Scale(zero=False),
        ),
        y="cumulative_count",
        color="meets_accuracy_cutoff",
        column=alt.Column("variable", title=None),
        row=alt.Row("pacbioRun", title=None),
        tooltip=[
            alt.Tooltip("log10_error", format=".3f"),
            alt.Tooltip("cumulative_fraction", format=".3f"),
            alt.Tooltip("cumulative_count", format=".4g"),
        ],
    )
    .mark_point(filled=True, size=30)
    .properties(width=225, height=120)
)
display(cumfrac_chart)

Remove CCSs that do not meet accuracy threshold:

In [8]:
print(f"There are {len(aligned_ccs)} CCSs before accuracy filtering.")
aligned_ccs = aligned_ccs.query("barcode_error <= @max_error_rate").query(
    "gene_error <= @max_error_rate"
)
print(f"After filtering {len(aligned_ccs)} CCSs remain.")
There are 2012023 CCSs before accuracy filtering.
After filtering 1686281 CCSs remain.

Convert in-frame deletions to substitutions¶

If a deletion is in-frame, convert it to substitution format, using `-` as the substitution character:
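
A toy sketch of what this conversion does (the mutation-string formats follow alignparse conventions; the exact strings below are illustrative assumptions, not taken from this dataset):

# illustrative only: an in-frame codon-length deletion becomes gap ("-")
# substitutions, while a deletion not spanning whole codons is left alone
toy = alignparse.utils.InFrameDeletionsToSubs("ATGGAATGC")
print(toy.dels_to_subs("del4to6"))  # expect something like "G4- A5- A6-"
print(toy.dels_to_subs("del3to5"))  # not in-frame, so returned unchanged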

In [9]:
deltosubs = alignparse.utils.InFrameDeletionsToSubs(geneseq)

aligned_ccs = (
    aligned_ccs.assign(
        mutations_inframe=lambda x: x["gene_mutations"].map(deltosubs.dels_to_subs),
        inframe_deletion=lambda x: x["gene_mutations"] != x["mutations_inframe"],
    )
    .drop(columns="gene_mutations")
    .rename(columns={"mutations_inframe": "gene_mutations"})
)

How many CCSs have in-frame deletions?

In [10]:
print("Number of CCSs with in-frame deletions:")
display(
    aligned_ccs.groupby("inframe_deletion")
    .aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
    .assign(fraction=lambda x: x["n_CCSs"] / x["n_CCSs"].sum())
    .round(3)
)
Number of CCSs with in-frame deletions:
                   n_CCSs  fraction
inframe_deletion
False             1684719     0.999
True                 1562     0.001

Empirical accuracy of CCSs¶

We can compute the empirical accuracy of individual CCSs by comparing mutations found in CCSs with the same barcode.
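
The intuition, as a rough sketch (a simplification; `alignparse.consensus.empirical_accuracy` does the real calculation): if two independent CCSs of the same variant agree only when both are error-free, then an observed pairwise agreement f implies a per-CCS accuracy of roughly sqrt(f):

# back-of-envelope intuition only, not the pipeline's actual calculation
f = 0.90  # hypothetical fraction of same-barcode CCS pairs with identical mutations
print(f"implied per-CCS accuracy ~ {f**0.5:.3f}")  # ~0.949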

First, annotate which CCSs have indels (after the conversion above, any remaining indels are not in-frame codon-length deletions):

In [11]:
aligned_ccs = alignparse.consensus.add_mut_info_cols(
    aligned_ccs,
    mutation_col="gene_mutations",
    n_indel_col="n_indels",
    overwrite_cols=True,
).assign(has_indel=lambda x: x["n_indels"] > 0)

Now compute the empirical accuracy, both for all sequences and for only those without indels, and both at the error-rate filter we chose to use and at one 10-fold more stringent. Note that in-frame codon deletions are not classified as deletions here since we re-classified them as substitutions above. The empirical accuracy is the estimated accuracy of each individual CCS:

In [12]:
# compute empirical accuracies
empirical_acc = []
for no_indel, acc_10x in itertools.product([True, False], [True, False]):
    df = aligned_ccs.copy()
    if no_indel:
        df = df.query("n_indels == 0")
        label = "no indels"
    else:
        label = "allow indels"
    if acc_10x:
        max_error_rate_10x = max_error_rate / 10
        df = df.query("barcode_error <= @max_error_rate_10x").query(
            "gene_error <= @max_error_rate_10x"
        )
        label += ", 10x accuracy"
    for pacbioRun, run_df in df.groupby("pacbioRun"):
        empirical_acc.append(
            alignparse.consensus.empirical_accuracy(
                run_df, upstream_group_cols=None, mutation_col="gene_mutations"
            ).assign(label=label, pacbioRun=pacbioRun)
        )
empirical_acc = pd.concat(empirical_acc, ignore_index=True)

# plot empirical accuracy
print("Empirical accuracies:")
empirical_acc_chart = (
    alt.Chart(empirical_acc)
    .encode(
        x=alt.X(
            "accuracy:Q",
            scale=alt.Scale(domain=(0, 1)),
        ),
        y=alt.Y(
            "label:N",
            axis=alt.Axis(title=None),
        ),
        facet=alt.Facet(
            "pacbioRun",
            title=None,
            columns=2,
        ),
        tooltip=[alt.Tooltip("accuracy", format=".3f")],
    )
    .mark_point(filled=True, size=75)
    .properties(width=225, height=65)
)
display(empirical_acc_chart)
Empirical accuracies:

Rarefaction and other estimates of library diversity¶

To help understand how many barcodes there are and how evenly reads are distributed among them, we make rarefaction curves. We also compute a few other metrics of diversity:

  1. the number of barcodes observed >$n$ times, for each $n$ up to the minimum number of CCSs required to call a consensus below.
  2. the inverse Simpson index, which is the reciprocal of the probability that two randomly drawn sequences have the same barcode (see the worked example below).
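
As a worked illustration of the inverse Simpson index (not part of the pipeline; the counts are hypothetical):

# with barcode frequencies p_i, the inverse Simpson index is 1 / sum(p_i**2),
# the reciprocal of the probability that two random draws share a barcode
counts = numpy.array([5, 3, 2])  # hypothetical CCS counts for three barcodes
p = counts / counts.sum()
print(1 / (p**2).sum())  # ~2.63 "effective" equally common barcodes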
In [13]:
# required CCSs per barcode to call consensus below
min_support = config["consensus_params"]["min_support"]


# number of counts for each library / barcode
barcodecounts = aligned_ccs.groupby(["library", "barcode"], as_index=False).aggregate(
    count=pd.NamedAgg("query_name", "count")
)

# make rarefaction plot
rarefy_df = pd.concat(
    [
        (
            dms_variants.barcodes.rarefyBarcodes(
                df, maxpoints=1000, logspace=False
            ).assign(library=library)
        )
        for library, df in barcodecounts.groupby("library")
    ]
)
rarefy_chart = (
    alt.Chart(rarefy_df)
    .encode(
        x=alt.X(
            "ncounts",
            title="number of CCSs",
        ),
        y=alt.Y(
            "nbarcodes",
            title="number of barcodes",
        ),
        color=alt.Color("library"),
        tooltip=rarefy_df.columns.tolist(),
    )
    .mark_point(size=10, filled=True)
    .resolve_scale(y="independent")
    .properties(height=175, width=300)
)
display(rarefy_chart)

# compute diversity statistics
diversity = dms_variants.barcodes.inverse_simpson_index(barcodecounts).melt(
    id_vars="library",
    value_vars="inverse_simpson_index",
    var_name="metric",
    value_name="diversity",
)
for n in range(min_support):
    diversity = pd.concat(
        [
            diversity,
            barcodecounts.query("count > @n")
            .groupby("library", as_index=False)
            .aggregate(diversity=pd.NamedAgg("barcode", "nunique"))
            .assign(metric=f"barcodes with >{n} CCSs"),
        ],
        ignore_index=True,
    )
diversity_chart = (
    alt.Chart(diversity)
    .encode(
        x=alt.X("diversity"),
        y=alt.Y("metric", title=None),
        facet=alt.Facet("library", columns=2, title=None),
        tooltip=diversity.columns.tolist(),
    )
    .mark_bar()
    .properties(width=200, height=15 * (min_support + 1))
)
display(diversity_chart)

Build consensus sequences¶

Use the alignparse.consensus.simple_mutconsensus method to build consensus sequences, and plot how many barcodes and CCSs contributed to valid consensuses or had to be dropped. Note that the stats for barcodes and CCSs look different because there are uneven numbers of CCSs per barcode:
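
First, a minimal sketch of how this method behaves on a toy input (the tiny DataFrame is hypothetical, and the exact outputs depend on the consensus parameters):

# illustrative only: four CCSs for one barcode, three reporting "A5C" and one
# also carrying a minor "T9G"; with min_support=3 there are enough CCSs to
# call a consensus, and sufficiently rare minor mutations are dropped from it
toy_ccs = pd.DataFrame(
    {
        "library": ["lib1"] * 4,
        "barcode": ["AAAA"] * 4,
        "gene_mutations": ["A5C", "A5C", "A5C", "A5C T9G"],
    }
)
toy_consensus, toy_dropped = alignparse.consensus.simple_mutconsensus(
    toy_ccs, mutation_col="gene_mutations", min_support=3
)
display(toy_consensus)  # expect a single consensus row calling "A5C"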

In [14]:
# get parameters for building consensus
consensus_params = config["consensus_params"]
print(
    "Building consensus sequences with following settings\n  "
    + "\n  ".join(f"{param}={val}" for param, val in consensus_params.items())
)

# build consensus sequences and plot results
max_plot_nseqs = 15  # group nseqs >= this together
plot_width = 225

consensus, dropped = alignparse.consensus.simple_mutconsensus(
    aligned_ccs, mutation_col="gene_mutations", **consensus_params
)

consensus_stats = (
    pd.concat(
        [
            consensus.rename(columns={"variant_call_support": "nseqs"})
            .drop(columns="gene_mutations")
            .assign(drop_reason="retained", dropped=False),
            dropped.assign(dropped=True),
        ]
    )
    .assign(nseqs=lambda x: x["nseqs"].clip(upper=max_plot_nseqs))
    .groupby(["library", "drop_reason", "dropped", "nseqs"], as_index=False)
    .aggregate(
        n_barcodes=pd.NamedAgg("barcode", "count"),
        n_CCSs=pd.NamedAgg("nseqs", "sum"),
    )
    .rename(columns={"drop_reason": "category"})
    .melt(
        id_vars=["library", "category", "dropped", "nseqs"],
        value_vars=["n_barcodes", "n_CCSs"],
        var_name="type_of_count",
        value_name="count",
    )
)

# get drop reasons in order to plot
drop_reasons = (
    consensus_stats.groupby(["category", "dropped"], as_index=False)
    .aggregate({"count": "sum"})
    .sort_values(["dropped", "count"], ascending=[True, False])["category"]
    .tolist()
)
drop_colors = ["blue", "orange", "orangered", "goldenrod", "gold", "darkorange"]
assert len(drop_reasons) <= len(drop_colors)

consensus_stats_chart = (
    alt.Chart(consensus_stats)
    .encode(
        x=alt.X(
            "nseqs",
            scale=alt.Scale(domain=(1, max_plot_nseqs)),
            title="number of CCSs for barcode",
        ),
        y=alt.Y("count", stack=True),
        color=alt.Color(
            "category",
            sort=drop_reasons,
            scale=alt.Scale(range=drop_colors),
        ),
        row=alt.Row(
            "library",
            title=None,
        ),
        column=alt.Column("type_of_count"),
        tooltip=consensus_stats.columns.tolist(),
    )
    .mark_bar(size=0.75 * plot_width / max_plot_nseqs)
    .properties(width=plot_width, height=120)
    .configure_axis(grid=False)
    .resolve_scale(y="independent")
)

display(consensus_stats_chart)
Building consensus sequences with following settings
  max_sub_diffs=None
  max_indel_diffs=None
  max_minor_sub_frac=0.2
  max_minor_indel_frac=0.2
  min_support=3

Write consensus sequences to barcode-variant lookup tables for subsequent use¶

We filter out any consensus sequences with indels that are not in-frame codon-length deletions (that is, we remove sequences with out-of-frame indels), and then write the remaining sequences to barcode-variant lookup tables for later use:
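
For reference, downstream steps might read the table back like this (a minimal sketch, not part of this notebook; `na_filter=False` keeps the empty `substitutions` string of unmutated variants from becoming NaN):

# hypothetical downstream usage of the barcode-variant lookup table
nt_variants_df = pd.read_csv(
    "results/variants/nt_variants.csv",
    na_filter=False,
)
# columns: library, barcode, substitutions, variant_call_support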

In [15]:
# annotate with information on mutation types
consensus = alignparse.consensus.add_mut_info_cols(
    consensus,
    mutation_col="gene_mutations",
    sub_str_col="substitutions",
    n_indel_col="n_indels",
)

# plot summary stats
stats = (
    consensus.assign(
        out_of_frame_indel=lambda x: x["n_indels"] > 0,
        in_frame_codon_deletion=lambda x: (
            (~x["out_of_frame_indel"]) & (x["substitutions"].str.contains("-"))
        ),
        no_indel=lambda x: (~x["out_of_frame_indel"]) & (~x["in_frame_codon_deletion"]),
    )[["library", "out_of_frame_indel", "in_frame_codon_deletion", "no_indel"]]
    .groupby("library", as_index=False)
    .aggregate("sum")
    .melt(
        id_vars="library",
        var_name="category",
        value_name="n_barcodes",
    )
    .assign(retained=lambda x: x["category"] != "out_of_frame_indel")
)
stats_chart = (
    alt.Chart(stats)
    .encode(
        x=alt.X("n_barcodes"),
        y=alt.Y("category", title=None),
        color=alt.Color("retained"),
        tooltip=stats.columns.tolist(),
        facet=alt.Facet("library", columns=2),
    )
    .mark_bar()
    .properties(width=175, height=45)
)
display(stats_chart)

# write to file
consensus = consensus.query("n_indels == 0")  # drop out-of-frame indels
nt_variants = "results/variants/nt_variants.csv"
os.makedirs(os.path.dirname(nt_variants), exist_ok=True)
print(f"Writing {len(df)} consensus sequences to {nt_variants}")
_ = consensus.to_csv(
    consensus[["library", "barcode", "substitutions", "variant_call_support"]].to_csv(
        nt_variants,
        index=False,
    )
)
Writing 1585446 consensus sequences to results/variants/nt_variants.csv