import dms_variants.codonvarianttable

import matplotlib.pyplot as plt

import multidms

import pandas as pd

# this cell is tagged parameters for `papermill` parameterization
selection = None
func_scores = None
func_effects = None
global_epistasis_params = None
threads = None

# Parameters
global_epistasis_params = {
    "clip_lower": -4,
    "clip_upper": None,
    "collapse_identical_variants": False,
}
selection = "LibB-231105-CHO-EFNB2-BA6-nac_diffVSV"
func_scores = (
    "results/func_scores/LibB-231105-CHO-EFNB2-BA6-nac_diffVSV_func_scores.csv"
)
func_effects = "results/func_effects/by_selection/LibB-231105-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv"
threads = 1

func_scores_df = (
    pd.read_csv(func_scores, na_filter=None)
    .assign(condition=selection)
    .pipe(dms_variants.codonvarianttable.CodonVariantTable.classifyVariants)
)

median_stop = func_scores_df.query("variant_class == 'stop'")["func_score"].median()

for bound in ["upper", "lower"]:
    clip = global_epistasis_params[f"clip_{bound}"]
    if clip is None:
        print(f"No clipping on {bound} bound of functional scores")
    else:
        if clip == "median_stop":
            if pd.isnull(median_stop):
                raise ValueError(f"{median_stop=}")
            clip = median_stop
        assert isinstance(clip, (int, float)), clip
        print(f"Clipping {bound} bound of functional scores to {clip}")
        func_scores_df["func_score"] = func_scores_df["func_score"].clip(
            **{bound: clip}
        )

No clipping on upper bound of functional scores
Clipping lower bound of functional scores to -4

data = multidms.Data(
    variants_df=func_scores_df,
    reference=selection,
    alphabet=multidms.AAS_WITHSTOP_WITHGAP,
    collapse_identical_variants=global_epistasis_params["collapse_identical_variants"],
    letter_suffixed_sites=True,
    verbose=True,
    nb_workers=threads,
    assert_site_integrity=True,
)

inferring site map for LibB-231105-CHO-EFNB2-BA6-nac_diffVSV

Asserting site integrity

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

unknown cond wildtype at sites: [],
dropping: 0 variantswhich have mutations at those sites.

invalid non-identical-sites: [], dropping 0 variants
Converting mutations for LibB-231105-CHO-EFNB2-BA6-nac_diffVSV
is reference, skipping

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.

# initialize with default params, which give sigmoid global epistasis function
model = multidms.Model(data)

model.fit()

fig, ax = plt.subplots(1, 2, figsize=[8, 4])
model.plot_epistasis(ax=ax[1], alpha=0.1, show=False, legend=False)
model.plot_pred_accuracy(ax=ax[0], alpha=0.1, show=False, legend=False)
ax[1].set_title("Global epistasis fit")
ax[0].set_title("Training set accuracy")
plt.show()

fig, axes = plt.subplots(3, 1, figsize=[7, 8])
for times_seen, ax in zip([1, 3, 5], axes):
    model.plot_param_hist("beta", ax=ax, show=False, times_seen_threshold=times_seen)
    ax.legend()
    ax.set_title(
        f"Latent-phenotype effects of mutations with times_seen >= {times_seen}"
    )
plt.tight_layout()
plt.show()

mut_effects = (
    pd.concat(
        [
            # get mutant effects
            (
                model.get_mutations_df(phenotype_as_effect=True).rename(
                    columns={
                        f"times_seen_{selection}": "times_seen",
                        "wts": "wildtype",
                        "sites": "site",
                        "muts": "mutant",
                        f"{selection}_predicted_func_score": "functional_effect",
                        "beta": "latent_phenotype_effect",
                    }
                )
            ),
            # add wildtypes, which all have effects of 0
            pd.DataFrame(
                {
                    "site": data.site_map.index,
                    "wildtype": data.site_map[selection],
                    "mutant": data.site_map[selection],
                    "latent_phenotype_effect": 0,
                    "functional_effect": 0,
                }
            ),
        ],
    )
    .sort_values(["site", "mutant"])
    .reset_index(drop=True)
)

mut_effects

print(f"Writing the mutational effects to {func_effects}")

mut_effects.to_csv(func_effects, index=False, float_format="%.4g")

Writing the mutational effects to results/func_effects/by_selection/LibB-231105-CHO-EFNB2-BA6-nac_diffVSV_func_effects.csv

	wildtype	site	mutant	times_seen	latent_phenotype_effect	functional_effect
0	K	100	*	3.0	-1.386369	-1.091477
1	K	100	A	7.0	-1.287385	-0.994191
2	K	100	C	4.0	-6.292242	-3.380415
3	K	100	D	2.0	-3.468647	-2.836495
4	K	100	E	11.0	-5.515200	-3.334269
...	...	...	...	...	...	...
10691	D	99	S	10.0	0.713634	0.296152
10692	D	99	T	5.0	0.239679	0.117785
10693	D	99	V	4.0	0.989806	0.372246
10694	D	99	W	9.0	1.313109	0.440907
10695	D	99	Y	15.0	-1.045272	-0.764948

Fit global epistasis models to functional scores for each selection to get mutation functional effects¶