import dms_variants.codonvarianttable

import matplotlib.pyplot as plt

import multidms

import pandas as pd

# this cell is tagged parameters for `papermill` parameterization
selection = None
func_scores = None
func_effects = None
global_epistasis_params = None
threads = None

# Parameters
global_epistasis_params = {
    "clip_lower": -4,
    "clip_upper": None,
    "collapse_identical_variants": False,
}
selection = "LibA-230825-CHO-bEFNB3"
func_scores = "results/func_scores/LibA-230825-CHO-bEFNB3_func_scores.csv"
func_effects = (
    "results/func_effects/by_selection/LibA-230825-CHO-bEFNB3_func_effects.csv"
)
threads = 1

func_scores_df = (
    pd.read_csv(func_scores, na_filter=None)
    .assign(condition=selection)
    .pipe(dms_variants.codonvarianttable.CodonVariantTable.classifyVariants)
)

median_stop = func_scores_df.query("variant_class == 'stop'")["func_score"].median()

for bound in ["upper", "lower"]:
    clip = global_epistasis_params[f"clip_{bound}"]
    if clip is None:
        print(f"No clipping on {bound} bound of functional scores")
    else:
        if clip == "median_stop":
            if pd.isnull(median_stop):
                raise ValueError(f"{median_stop=}")
            clip = median_stop
        assert isinstance(clip, (int, float)), clip
        print(f"Clipping {bound} bound of functional scores to {clip}")
        func_scores_df["func_score"] = func_scores_df["func_score"].clip(
            **{bound: clip}
        )

No clipping on upper bound of functional scores
Clipping lower bound of functional scores to -4

data = multidms.Data(
    variants_df=func_scores_df,
    reference=selection,
    alphabet=multidms.AAS_WITHSTOP_WITHGAP,
    collapse_identical_variants=global_epistasis_params["collapse_identical_variants"],
    letter_suffixed_sites=True,
    verbose=True,
    nb_workers=threads,
    assert_site_integrity=True,
)

inferring site map for LibA-230825-CHO-bEFNB3

Asserting site integrity

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

unknown cond wildtype at sites: [],
dropping: 0 variantswhich have mutations at those sites.

invalid non-identical-sites: [], dropping 0 variants
Converting mutations for LibA-230825-CHO-bEFNB3
is reference, skipping

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.

# initialize with default params, which give sigmoid global epistasis function
model = multidms.Model(data)

model.fit()

fig, ax = plt.subplots(1, 2, figsize=[8, 4])
model.plot_epistasis(ax=ax[1], alpha=0.1, show=False, legend=False)
model.plot_pred_accuracy(ax=ax[0], alpha=0.1, show=False, legend=False)
ax[1].set_title("Global epistasis fit")
ax[0].set_title("Training set accuracy")
plt.show()

fig, axes = plt.subplots(3, 1, figsize=[7, 8])
for times_seen, ax in zip([1, 3, 5], axes):
    model.plot_param_hist("beta", ax=ax, show=False, times_seen_threshold=times_seen)
    ax.legend()
    ax.set_title(
        f"Latent-phenotype effects of mutations with times_seen >= {times_seen}"
    )
plt.tight_layout()
plt.show()

mut_effects = (
    pd.concat(
        [
            # get mutant effects
            (
                model.get_mutations_df(phenotype_as_effect=True).rename(
                    columns={
                        f"times_seen_{selection}": "times_seen",
                        "wts": "wildtype",
                        "sites": "site",
                        "muts": "mutant",
                        f"{selection}_predicted_func_score": "functional_effect",
                        "beta": "latent_phenotype_effect",
                    }
                )
            ),
            # add wildtypes, which all have effects of 0
            pd.DataFrame(
                {
                    "site": data.site_map.index,
                    "wildtype": data.site_map[selection],
                    "mutant": data.site_map[selection],
                    "latent_phenotype_effect": 0,
                    "functional_effect": 0,
                }
            ),
        ],
    )
    .sort_values(["site", "mutant"])
    .reset_index(drop=True)
)

mut_effects

print(f"Writing the mutational effects to {func_effects}")

mut_effects.to_csv(func_effects, index=False, float_format="%.4g")

Writing the mutational effects to results/func_effects/by_selection/LibA-230825-CHO-bEFNB3_func_effects.csv

	wildtype	site	mutant	times_seen	latent_phenotype_effect	functional_effect
0	K	100	*	5.0	-3.148952	-2.759116
1	K	100	A	1.0	-1.089771	-0.850025
2	K	100	C	7.0	-5.670584	-3.453770
3	K	100	D	1.0	-2.640395	-2.387667
4	K	100	E	13.0	-6.098275	-3.479310
...	...	...	...	...	...	...
10690	D	99	S	7.0	-0.683719	-0.480872
10691	D	99	T	7.0	0.797084	0.341920
10692	D	99	V	6.0	-0.067509	-0.039187
10693	D	99	W	7.0	-0.498394	-0.332082
10694	D	99	Y	13.0	-0.670625	-0.469924

Fit global epistasis models to functional scores for each selection to get mutation functional effects¶