Analyze PacBio CCSs¶

This notebook analyzes the PacBio CCSs that have been previously aligned and parsed with alignparse.

Import Python modules:

In [1]:
import os

import alignparse.targets

import altair as alt

import pandas as pd

The next cell is tagged as parameters for papermill parameterization:

In [2]:
pacbio_amplicon = None
pacbio_amplicon_specs = None
pacbio_runs_csv = None
In [3]:
# Parameters
pacbio_amplicon = "data/PacBio_amplicon.gb"
pacbio_amplicon_specs = "data/PacBio_feature_parse_specs.yaml"
pacbio_runs_csv = "data/PacBio_runs.csv"

Read in the PacBio runs:

In [4]:
pacbio_runs = (
    pd.read_csv(pacbio_runs_csv)
    .assign(subdir=lambda x: "results/process_ccs/" + x["run"])
    .rename(columns={"run": "pacbioRun"})
)

pacbio_runs
Out[4]:
library pacbioRun fastq subdir
0 E3E2_A E3E2_A_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E3E2_A_240930
1 E3E2_B E3E2_B_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E3E2_B_240930
2 6KE1_A 6KE1_A_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/6KE1_A_240930
3 6KE1_B 6KE1_B_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/6KE1_B_240930
4 E_A E_E3E2_A_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E_E3E2_A_240930
5 E_B E_E3E2_B_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E_E3E2_B_240930
6 E_A E_6KE1_A_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E_6KE1_A_240930
7 E_B E_6KE1_B_240930 /fh/fast/bloom_j/SR/ngs/pacbio/240930_XiaohuiJ... results/process_ccs/E_6KE1_B_240930

Stats on CCS alignments¶

Read and plot the alignment stats from running alignparse on the PacBio CCSs:

In [5]:
readstats = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "readstats.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

readstats_chart = (
    alt.Chart(readstats)
    .encode(
        x="count:Q",
        y=alt.Y(
            "category:N",
            axis=alt.Axis(title=None),
        ),
        tooltip=readstats.columns.tolist(),
        facet=alt.Facet("pacbioRun:N", columns=2, title=None),
    )
    .mark_bar()
    .properties(width=250, height=50)
    .resolve_scale(x="independent", y="independent")
)

readstats_chart
Out[5]:

Visualize target to which reads are being aligned¶

Draw images of the target we're parsing:

In [6]:
targets = alignparse.targets.Targets(
    seqsfile=pacbio_amplicon,
    feature_parse_specs=pacbio_amplicon_specs,
)

fig = targets.plot(
    ax_width=7,
    plots_indexing="genbank",
    ax_height=2,
    hspace=1.2,
)
/fh/fast/bloom_j/computational_notebooks/jbloom/2025/CHIKV-181-25-E-DMS/.snakemake/conda/e937d0191e9e0d4fae02985bc3f90aba_/lib/python3.12/site-packages/Bio/GenBank/Scanner.py:1537: BiopythonParserWarning: Attempting to parse malformed locus line:
'LOCUS       PacBio_amplicon        3153 bp DNA     linear   SYN 09-OCT-2024\n'
Found locus 'PacBio_amplicon' size '3153' residue_type 'DNA'
Some fields may be wrong.
  warnings.warn(
/fh/fast/bloom_j/computational_notebooks/jbloom/2025/CHIKV-181-25-E-DMS/.snakemake/conda/e937d0191e9e0d4fae02985bc3f90aba_/lib/python3.12/site-packages/Bio/GenBank/Scanner.py:1214: BiopythonParserWarning: Premature end of file in sequence data
  warnings.warn(
No description has been provided for this image

Why were some CCSs filtered?¶

Plot the number of CCSs filtered for each reason:

In [7]:
# CSVs holding filtered reads
filtered_csvs = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "filtered.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

# details for all filtered reads
filtered = pd.concat(
    [
        pd.read_csv(row.csv_file).assign(
            target=row.target, pacbioRun=row.pacbioRun, library=row.library
        )
        for row in filtered_csvs.itertuples()
    ],
    ignore_index=True,
)

# count reasons for filtering, then add number of non-filtered
filtered_reasons = pd.concat(
    [
        filtered.groupby(["pacbioRun", "filter_reason"], as_index=False).aggregate(
            count=pd.NamedAgg("query_name", "count")
        ),
        readstats.query('category.str.startswith("aligned")', engine="python")
        .groupby("pacbioRun", as_index=False)
        .aggregate({"count": "sum"})
        .assign(filter_reason="aligned"),
    ]
).assign(
    total_counts=lambda x: x.groupby("pacbioRun")["count"].transform("sum"),
    frac_counts=lambda x: x["count"] / x["total_counts"],
)

# make chart
filtered_chart = (
    alt.Chart(filtered_reasons)
    .encode(
        x="count:Q",
        y=alt.Y(
            "filter_reason:N",
            axis=alt.Axis(title=None),
        ),
        color="is_aligned:N",
        tooltip=filtered_reasons.columns.tolist(),
        facet=alt.Facet("pacbioRun:N", columns=2, title=None),
    )
    .mark_bar()
    .properties(width=250, height=75)
    .resolve_scale(x="independent", y="independent")
    .transform_filter(alt.datum.frac_counts > 0.01)
    .transform_calculate(is_aligned=alt.datum.filter_reason == "aligned")
)

filtered_chart
Out[7]:

Get CCSs that align to the amplicon¶

In [8]:
# CSVs holding aligned reads
aligned_csvs = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "aligned.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

assert aligned_csvs["target"].nunique() == 1

aligned = pd.concat(
    [
        (
            pd.read_csv(row.csv_file)
            .assign(pacbioRun=row.pacbioRun, library=row.library)
            .drop(columns=["query_clip5", "query_clip3"])
            .rename(columns={"barcode_sequence": "barcode"})
        )
        for row in aligned_csvs.itertuples()
    ],
    ignore_index=True,
)
print(f"\nRead {len(aligned):.4g} alignable CCSs:")
display(
    aligned.groupby("pacbioRun").aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
)
output_csv = "results/process_ccs/CCSs_aligned_to_amplicon.csv"
print(f"Writing to {output_csv}")
aligned.to_csv(output_csv, index=False)
Read 1.059e+07 alignable CCSs:
n_CCSs
pacbioRun
6KE1_A_240930 1625103
6KE1_B_240930 1049469
E3E2_A_240930 1299377
E3E2_B_240930 1320603
E_6KE1_A_240930 1625103
E_6KE1_B_240930 1049469
E_E3E2_A_240930 1299377
E_E3E2_B_240930 1320603
Writing to results/process_ccs/CCSs_aligned_to_amplicon.csv
In [ ]: