import pandas as pd

import polyclonal
import polyclonal.plot

import seaborn

# this cell is tagged parameters for `papermill` parameterization
site_numbering_map_csv = None
shifts_csv = None
shifts_html = None
params = None

# Parameters
params = {
    "avg_method": "median",
    "per_comparison_tooltips": True,
    "plot_kwargs": {
        "addtl_slider_stats": {"times_seen": 3},
        "heatmap_max_at_least": 0.5,
        "heatmap_min_at_least": -0.5,
        "init_floor_at_zero": False,
        "init_site_statistic": "mean",
        "site_zoom_bar_color_col": "region",
        "slider_binding_range_kwargs": {
            "times_seen": {"step": 1, "min": 1, "max": 25},
            "n_comparisons": {"step": 1},
        },
    },
    "legend": "Interactive plot of shifts in effects of mutations. Negative values indicate mutations that\nhave shifted to become more deleterious; positive values indicate mutations shifted to\nbe more beneficial.\n\nUse the site zoom bar at the top to zoom in on specific sites. The line plot shows a summary\nstatistic indicating the effects of mutations at each site. The heat map shows the effects of\nindividual mutations, with parental amino-acid identities indicated by x and gray\nindicating non-measured mutations.\n\nYou can mouse over points to get details about individual measurements, including measurements\nin individual comparisons.\n\nThe options at the bottom of the plot let you modify the display, such as by selecting how\n many different variants a mutation must be seen in to be shown (*minimum times_seen*),\n how many different experimental comparisons the mutation was measured in\n(*minimum n_comparisons*), what site summary statistic to show, etc.\n",
    "title": "Comparison of functional effects measured on 293T cells expressing different levels of ACE2",
    "comparisons": ["Lib1-230614_ACE2_expression", "Lib2-230614_ACE2_expression"],
    "lasso_shift": 0.0001,
}
site_numbering_map_csv = "data/site_numbering_map.csv"
shifts_csv = "results/func_effect_shifts/averages/target_cell_ACE2_expression_comparison_shifts.csv"
shifts_html = "results/func_effect_shifts/averages/target_cell_ACE2_expression_comparison_shifts_nolegend.html"

comparisons = params["comparisons"]

shifts = [
    pd.read_csv(f"results/func_effect_shifts/by_comparison/{c}_shifts.csv").assign(
        comparison=c,
        lasso_shift=lambda x: x["lasso_shift"].astype(float),
    )
    for c in comparisons
]

# check all shift comparisons are comparable:
for shift_df in shifts[1:]:
    if (shift_df.columns != shifts[0].columns).any():
        raise ValueError("comparisons do not all have the same columns")
    if set(shift_df["lasso_shift"]) != set(shifts[0]["lasso_shift"]):
        raise ValueError("comparisons do not all have the same `lasso_shifts`")

shifts = pd.concat(shifts)

# add a times_seen column that is the average of all of the times_seen in all conditions
times_seen_cols = [c for c in shifts.columns if c.startswith("times_seen_")]
shifts["times_seen"] = shifts[times_seen_cols].mean(axis=1)

# get shifts in tidy format
shift_cols = [c for c in shifts.columns if c.startswith("shift_")]
shifts_tidy = shifts.melt(
    id_vars=[
        "comparison",
        "site",
        "wildtype",
        "mutant",
        "lasso_shift",
        "times_seen",
        "latent_phenotype_effect",
    ],
    value_vars=shift_cols,
    var_name="condition",
    value_name="shift",
)

# average times_seen & latent_phenotype_effect across comparisons, pivot on comparisons
shifts_comparison_pivoted = (
    shifts_tidy.assign(
        times_seen=lambda x: x.groupby(["site", "mutant", "lasso_shift"])[
            "times_seen"
        ].transform("mean"),
        latent_phenotype_effect=lambda x: x.groupby(["site", "mutant", "lasso_shift"])[
            "latent_phenotype_effect"
        ].transform("mean"),
    )
    .pivot_table(
        index=[
            "site",
            "wildtype",
            "mutant",
            "latent_phenotype_effect",
            "times_seen",
            "lasso_shift",
            "condition",
        ],
        values="shift",
        columns="comparison",
    )
    .reset_index()
)

try:
    times_seen = params["plot_kwargs"]["addtl_slider_stats"]["times_seen"]
except KeyError:
    times_seen = 3

print(f"Only plotting mutations with times_seen >= {times_seen}")

for lasso_shift, df in shifts_comparison_pivoted.groupby("lasso_shift"):
    grid = seaborn.pairplot(
        df.query("times_seen >= 3").query("wildtype != mutant"),
        vars=comparisons,
        hue=(
            None
            if shifts_comparison_pivoted["condition"].nunique() == 1
            else "condition"
        ),
        plot_kws={"alpha": 0.3, "s": 25},
    )
    grid.fig.suptitle(f"lasso shift = {lasso_shift}")
    grid.fig.tight_layout()

Only plotting mutations with times_seen >= 3

lasso_shift = float(params["lasso_shift"])
avg_method = params["avg_method"]

assert lasso_shift in set(shifts_comparison_pivoted["lasso_shift"])
assert avg_method in {"mean", "median"}, avg_method

site_numbering_map = pd.read_csv(site_numbering_map_csv).rename(
    columns={"reference_site": "site"}
)
addtl_site_cols = [
    c for c in site_numbering_map.columns if c != "site" and c.endswith("site")
]

# get the data to plot
df = (
    shifts_comparison_pivoted.query("lasso_shift == @lasso_shift")
    .drop(columns="lasso_shift")
    .merge(
        site_numbering_map[["site", *addtl_site_cols, "region"]],
        on="site",
        validate="many_to_one",
    )
    .assign(
        shift=lambda x: x[comparisons].apply(avg_method, axis=1),
        n_comparisons=lambda x: x[comparisons].notnull().sum(axis=1),
    )
)

print(f"Saving shifts to {shifts_csv}")
df.drop(columns=[c for c in addtl_site_cols if c != "sequential_site"]).to_csv(
    shifts_csv, index=False, float_format="%.4g"
)

Saving shifts to results/func_effect_shifts/averages/target_cell_ACE2_expression_comparison_shifts.csv

plot_kwargs = params["plot_kwargs"]

if "addtl_slider_stats" not in plot_kwargs:
    plot_kwargs["addtl_slider_stats"] = {}

if "times_seen" not in plot_kwargs["addtl_slider_stats"]:
    plot_kwargs["addtl_slider_stats"]["times_seen"] = times_seen

if "n_comparisons" not in plot_kwargs["addtl_slider_stats"]:
    plot_kwargs["addtl_slider_stats"]["n_comparisons"] = len(comparisons) // 2 + 1

if "addtl_tooltip_stats" not in plot_kwargs:
    plot_kwargs["addtl_tooltip_stats"] = []
for c in addtl_site_cols:
    if c not in plot_kwargs["addtl_tooltip_stats"]:
        plot_kwargs["addtl_tooltip_stats"].append(c)

if any(df["site"] != df["sequential_site"]):
    if "sequential_site" not in plot_kwargs["addtl_tooltip_stats"]:
        plot_kwargs["addtl_tooltip_stats"].append("sequential_site")

if params["per_comparison_tooltips"]:
    assert set(comparisons).issubset(df.columns)
    plot_kwargs["addtl_tooltip_stats"] += [
        c for c in comparisons if c not in plot_kwargs["addtl_tooltip_stats"]
    ]

if "alphabet" not in plot_kwargs:
    plot_kwargs["alphabet"] = [
        a
        for a in polyclonal.alphabets.biochem_order_aas(polyclonal.AAS_WITHSTOP_WITHGAP)
        if a in set(df["mutant"])
    ]

if "sites" not in plot_kwargs:
    plot_kwargs["sites"] = df.sort_values("sequential_site")["site"].unique().tolist()

chart = polyclonal.plot.lineplot_and_heatmap(
    data_df=df,
    stat_col="shift",
    category_col="condition",
    **plot_kwargs,
)

print(f"Saving chart to {shifts_html}")
chart.save(shifts_html)

chart

Saving chart to results/func_effect_shifts/averages/target_cell_ACE2_expression_comparison_shifts_nolegend.html

Average mutation functional effect shifts for a set of comparisons¶