import os

import alignparse.targets

import altair as alt

import pandas as pd

pacbio_amplicon = None
pacbio_amplicon_specs = None
pacbio_runs_csv = None

# Parameters
pacbio_amplicon = "data/PacBio_amplicon.gb"
pacbio_amplicon_specs = "data/PacBio_feature_parse_specs.yaml"
pacbio_runs_csv = "data/PacBio_runs.csv"

pacbio_runs = (
    pd.read_csv(pacbio_runs_csv)
    .assign(subdir=lambda x: "results/process_ccs/" + x["run"])
    .rename(columns={"run": "pacbioRun"})
)

pacbio_runs

readstats = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "readstats.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

readstats_chart = (
    alt.Chart(readstats)
    .encode(
        x="count:Q",
        y=alt.Y(
            "category:N",
            axis=alt.Axis(title=None),
        ),
        tooltip=readstats.columns.tolist(),
        facet=alt.Facet("pacbioRun:N", columns=2, title=None),
    )
    .mark_bar()
    .properties(width=250, height=50)
    .resolve_scale(x="independent", y="independent")
)

readstats_chart

targets = alignparse.targets.Targets(
    seqsfile=pacbio_amplicon,
    feature_parse_specs=pacbio_amplicon_specs,
)

fig = targets.plot(
    ax_width=7,
    plots_indexing="genbank",
    ax_height=2,
    hspace=1.2,
)

# CSVs holding filtered reads
filtered_csvs = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "filtered.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

# details for all filtered reads
filtered = pd.concat(
    [
        pd.read_csv(row.csv_file).assign(
            target=row.target, pacbioRun=row.pacbioRun, library=row.library
        )
        for row in filtered_csvs.itertuples()
    ],
    ignore_index=True,
)

# count reasons for filtering, then add number of non-filtered
filtered_reasons = pd.concat(
    [
        filtered.groupby(["pacbioRun", "filter_reason"], as_index=False).aggregate(
            count=pd.NamedAgg("query_name", "count")
        ),
        readstats.query('category.str.startswith("aligned")', engine="python")
        .groupby("pacbioRun", as_index=False)
        .aggregate({"count": "sum"})
        .assign(filter_reason="aligned"),
    ]
).assign(
    total_counts=lambda x: x.groupby("pacbioRun")["count"].transform("sum"),
    frac_counts=lambda x: x["count"] / x["total_counts"],
)

# make chart
filtered_chart = (
    alt.Chart(filtered_reasons)
    .encode(
        x="count:Q",
        y=alt.Y(
            "filter_reason:N",
            axis=alt.Axis(title=None),
        ),
        color="is_aligned:N",
        tooltip=filtered_reasons.columns.tolist(),
        facet=alt.Facet("pacbioRun:N", columns=2, title=None),
    )
    .mark_bar()
    .properties(width=250, height=75)
    .resolve_scale(x="independent", y="independent")
    .transform_filter(alt.datum.frac_counts > 0.01)
    .transform_calculate(is_aligned=alt.datum.filter_reason == "aligned")
)

filtered_chart

# CSVs holding aligned reads
aligned_csvs = pd.concat(
    [
        (
            pd.read_csv(os.path.join(row.subdir, "aligned.csv")).assign(
                pacbioRun=row.pacbioRun, library=row.library
            )
        )
        for row in pacbio_runs.itertuples()
    ],
    ignore_index=True,
)

assert aligned_csvs["target"].nunique() == 1

aligned = pd.concat(
    [
        (
            pd.read_csv(row.csv_file)
            .assign(pacbioRun=row.pacbioRun, library=row.library)
            .drop(columns=["query_clip5", "query_clip3"])
            .rename(columns={"barcode_sequence": "barcode"})
        )
        for row in aligned_csvs.itertuples()
    ],
    ignore_index=True,
)
print(f"\nRead {len(aligned):.4g} alignable CCSs:")
display(
    aligned.groupby("pacbioRun").aggregate(n_CCSs=pd.NamedAgg("query_name", "count"))
)
output_csv = "results/process_ccs/CCSs_aligned_to_amplicon.csv"
print(f"Writing to {output_csv}")
aligned.to_csv(output_csv, index=False)

Read 1.293e+06 alignable CCSs:

Writing to results/process_ccs/CCSs_aligned_to_amplicon.csv

	library	pacbioRun	fastq	subdir
0	Lib1	Lib1-230612	/fh/fast/bloom_j/computational_notebooks/bdado...	results/process_ccs/Lib1-230612
1	Lib1	Lib1-230613	/fh/fast/bloom_j/computational_notebooks/bdado...	results/process_ccs/Lib1-230613
2	Lib2	Lib2-230612	/fh/fast/bloom_j/computational_notebooks/bdado...	results/process_ccs/Lib2-230612
3	Lib2	Lib2-230613	/fh/fast/bloom_j/computational_notebooks/bdado...	results/process_ccs/Lib2-230613

	n_CCSs
pacbioRun
Lib1-230612	461313
Lib1-230613	294625
Lib2-230612	327872
Lib2-230613	209088

Analyze PacBio CCSs¶

Stats on CCS alignments¶

Visualize target to which reads are being aligned¶

Why were some CCSs filtered?¶

Get CCSs that align to the amplicon¶