Convert escape and functional scores to polyclonal b factors for PDB viewing¶
In [1]:
# Imports
import os
import pandas as pd
import polyclonal
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
func_scores = None
pdb_file = None
filtered_escape_377H = None
filtered_escape_89F = None
filtered_escape_2510C = None
filtered_escape_121F = None
filtered_escape_256A = None
filtered_escape_372D = None
natural_sequence_variation = None
out_dir = None
pdb_func_min = None
pdb_func_max = None
pdb_377H = None
pdb_89F = None
pdb_2510C = None
pdb_121F = None
pdb_256A = None
pdb_372D = None
pdb_natural_variation = None
pdb_arevirumab = None
In [3]:
# Parameters
func_scores = "results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
pdb_file = "data/7puy.pdb"
filtered_escape_377H = (
"results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
)
filtered_escape_89F = (
"results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
)
filtered_escape_2510C = (
"results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
)
filtered_escape_121F = (
"results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
)
filtered_escape_256A = (
"results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
)
filtered_escape_372D = (
"results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"
)
natural_sequence_variation = (
"non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
)
out_dir = "results/mapped_scores_onto_pdb/"
pdb_func_min = "results/mapped_scores_onto_pdb/func_scores_min.pdb"
pdb_func_max = "results/mapped_scores_onto_pdb/func_scores_max.pdb"
pdb_377H = "results/mapped_scores_onto_pdb/377H_escape.pdb"
pdb_89F = "results/mapped_scores_onto_pdb/89F_escape.pdb"
pdb_2510C = "results/mapped_scores_onto_pdb/2510C_escape.pdb"
pdb_121F = "results/mapped_scores_onto_pdb/121F_escape.pdb"
pdb_256A = "results/mapped_scores_onto_pdb/256A_escape.pdb"
pdb_372D = "results/mapped_scores_onto_pdb/372D_escape.pdb"
pdb_natural_variation = "results/mapped_scores_onto_pdb/natural_variation.pdb"
pdb_arevirumab = "results/mapped_scores_onto_pdb/arevirumab_escape.pdb"
In [4]:
# # Uncomment for running interactive
# func_scores = "../results/filtered_func_effect_CSVs/293T_filtered_func_effects.csv"
# pdb_file = "../data/7puy.pdb"
# filtered_escape_377H = "../results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
# filtered_escape_89F = "../results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
# filtered_escape_2510C = "../results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
# filtered_escape_121F = "../results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
# filtered_escape_256A = "../results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
# filtered_escape_372D = "../results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"
# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# out_dir = "../results/mapped_scores_onto_pdb/"
# pdb_func_min = "../results/mapped_scores_onto_pdb/func_scores_min.pdb"
# pdb_func_max = "../results/mapped_scores_onto_pdb/func_scores_max.pdb"
# pdb_377H = "../results/mapped_scores_onto_pdb/377H_escape.pdb"
# pdb_89F = "../results/mapped_scores_onto_pdb/89F_escape.pdb"
# pdb_2510C = "../results/mapped_scores_onto_pdb/2510C_escape.pdb"
# pdb_121F = "../results/mapped_scores_onto_pdb/121F_escape.pdb"
# pdb_256A = "../results/mapped_scores_onto_pdb/256A_escape.pdb"
# pdb_372D = "../results/mapped_scores_onto_pdb/372D_escape.pdb"
# pdb_natural_variation = "../results/mapped_scores_onto_pdb/natural_variation.pdb"
# pdb_arevirumab = "../results/mapped_scores_onto_pdb/arevirumab_escape.pdb"
In [5]:
def natural_variation_to_b_factors(input_pdb_file, output_pdb_file, natural_variation_file):
"""
Function to map natural variation to a pdb structure
using mut_escape_pdb_b_factor function from polyclonal.
"""
natural_variation = pd.read_csv(natural_variation_file)
natural_variation_aA = natural_variation[["site", "entropy"]].copy()
natural_variation_bB = natural_variation[["site", "entropy"]].copy()
natural_variation_cC = natural_variation[["site", "entropy"]].copy()
natural_variation_aA["chain"] = (
natural_variation_aA.apply(lambda x: "A" if x["site"] <= 259 else "a", axis=1)
)
natural_variation_bB["chain"] = (
natural_variation_bB.apply(lambda x: "B" if x["site"] <= 259 else "b", axis=1)
)
natural_variation_cC["chain"] = (
natural_variation_cC.apply(lambda x: "C" if x["site"] <= 259 else "c", axis=1)
)
natural_variation = (
pd.concat([
natural_variation_aA,
natural_variation_bB,
natural_variation_cC,
], ignore_index=True)
)
natural_variation = natural_variation.astype({"site" : "int"})
print("Natural sequence variation:")
print(f"50th percentile of entropy: {natural_variation['entropy'].quantile(0.50)}")
print(f"75th percentile of entropy: {natural_variation['entropy'].quantile(0.75)}")
print(f"90th percentile of entropys: {natural_variation['entropy'].quantile(0.90)}")
print(f"Entropy score range: {natural_variation['entropy'].min()} to {natural_variation['entropy'].max()}")
print()
polyclonal.pdb_utils.reassign_b_factor(
input_pdbfile=input_pdb_file,
output_pdbfile=output_pdb_file,
df=natural_variation,
metric_col="entropy",
site_col="site",
chain_col="chain",
)
In [6]:
def functional_scores_to_b_factors(input_pdb_file, output_pdb_file, func_scores_file):
"""
Function to map funcitonal scores to a pdb structure
using mut_escape_pdb_b_factor function from polyclonal.
"""
functional_scores = pd.read_csv(func_scores_file)
print("Functional DMS scores:")
# Filter for minimum selections, times seen and no stop codons
functional_scores = (
functional_scores.query(
"mutant != '*'"
)
.drop(columns=["mutant", "times_seen", "wildtype"])
.groupby(["site"])
.aggregate({
"effect" : "mean"
})
.reset_index()
)
# Scores greater than 0
max_scores = functional_scores.loc[functional_scores["effect"] > 0]["site"].tolist()
print(f"Sites greater than 0: {max_scores}")
functional_scores_aA = functional_scores.copy()
functional_scores_bB = functional_scores.copy()
functional_scores_cC = functional_scores.copy()
functional_scores_aA["chain"] = (
functional_scores_aA.apply(lambda x: "A" if x["site"] <= 259 else "a", axis=1)
)
functional_scores_bB["chain"] = (
functional_scores_bB.apply(lambda x: "B" if x["site"] <= 259 else "b", axis=1)
)
functional_scores_cC["chain"] = (
functional_scores_cC.apply(lambda x: "C" if x["site"] <= 259 else "c", axis=1)
)
functional_scores = (
pd.concat([
functional_scores_aA,
functional_scores_bB,
functional_scores_cC,
], ignore_index=True)
)
functional_scores = functional_scores.astype({"site" : "int"})
print(f"50th percentile of averaged functional scores: {functional_scores['effect'].quantile(0.50)}")
print(f"Functional score range: {functional_scores['effect'].min()} to {functional_scores['effect'].max()}")
print()
polyclonal.pdb_utils.reassign_b_factor(
input_pdbfile=input_pdb_file,
output_pdbfile=output_pdb_file,
df=functional_scores,
metric_col="effect",
site_col="site",
chain_col="chain",
)
In [7]:
def escape_scores_to_b_factors(input_pdb_file, output_pdb_file, escape_scores_file):
"""
Function to map escape scores to a pdb structure
using mut_escape_pdb_b_factor function from polyclonal.
"""
escape_scores = pd.read_csv(escape_scores_file)
antibody_name = escape_scores_file.split("/")[-1].split("_")[0]
# Filter escape df for low functional score mutations
escape_scores = (
escape_scores
.query("poor_cell_entry == False")
.copy()
.reset_index(drop=True)
)
# Add dummy phenotype column
escape_scores["phenotype"] = "escape"
# Clip lower scores to 0
escape_scores["escape_median"] = escape_scores["escape_median"].clip(lower=0)
# Calculate site sums
escape_scores = (
escape_scores.groupby(["site"])
.aggregate({
"escape_median" : "sum"
})
.reset_index()
)
print(antibody_name)
print(f"Max summed escape: {escape_scores['escape_median'].max()}") # Verify max matches altair plots
print(f"50th percentile of summed escape scores: {escape_scores['escape_median'].quantile(0.50)}")
print(f"75th percentile of summed escape scores: {escape_scores['escape_median'].quantile(0.75)}")
print(f"90th percentile of summed escape scores: {escape_scores['escape_median'].quantile(0.90)}")
print(f"95th percentile of summed escape scores: {escape_scores['escape_median'].quantile(0.95)}")
print(f"99th percentile of summed escape scores: {escape_scores['escape_median'].quantile(0.99)}")
print()
escape_scores_aA = escape_scores.copy()
escape_scores_bB = escape_scores.copy()
escape_scores_cC = escape_scores.copy()
escape_scores_aA["chain"] = (
escape_scores_aA.apply(lambda x: "A" if x["site"] <= 259 else "a", axis=1)
)
escape_scores_bB["chain"] = (
escape_scores_bB.apply(lambda x: "B" if x["site"] <= 259 else "b", axis=1)
)
escape_scores_cC["chain"] = (
escape_scores_cC.apply(lambda x: "C" if x["site"] <= 259 else "c", axis=1)
)
escape_scores = (
pd.concat([
escape_scores_aA,
escape_scores_bB,
escape_scores_cC,
], ignore_index=True)
)
escape_scores = escape_scores.astype({"site" : "int"})
polyclonal.pdb_utils.reassign_b_factor(
input_pdbfile=input_pdb_file,
output_pdbfile=output_pdb_file,
df=escape_scores,
metric_col="escape_median",
site_col="site",
chain_col="chain",
)
In [8]:
# Make output dir if doesn't exist
if not os.path.exists(out_dir):
os.mkdir(out_dir)
# Natural variation (effective amino acids) mapped to pdb structure
natural_variation_to_b_factors(pdb_file, pdb_natural_variation, natural_sequence_variation)
# Functional scores mapped to pdb structure for scores less than and greater than 0
functional_scores_to_b_factors(pdb_file, pdb_func_min, func_scores)
functional_scores_to_b_factors(pdb_file, pdb_func_max, func_scores)
# Antibody files
antibodies = [
filtered_escape_2510C,
filtered_escape_121F,
filtered_escape_377H,
filtered_escape_256A,
filtered_escape_372D,
filtered_escape_89F,
]
output_files = [
pdb_2510C,
pdb_121F,
pdb_377H,
pdb_256A,
pdb_372D,
pdb_89F,
]
# Escape scores mapped to pdb structure
for index, antibody in enumerate(antibodies):
escape_scores_to_b_factors(pdb_file, output_files[index], antibody)
# Load arevirumab antibody files
escape_89F = pd.read_csv(filtered_escape_89F)
escape_121F = pd.read_csv(filtered_escape_121F)
escape_372D = pd.read_csv(filtered_escape_372D)
# Filter escape df for low functional score mutations
escape_89F = (
escape_89F
.query("poor_cell_entry == False")
.copy()
.reset_index(drop=True)
)
# Add dummy phenotype column
escape_89F["phenotype"] = "escape"
# Clip lower scores to 0
escape_89F["escape_median"] = escape_89F["escape_median"].clip(lower=0)
# Calculate site sums
escape_89F = (
escape_89F.groupby(["site"])
.aggregate({
"escape_median" : "sum"
})
.rename(columns={"escape_median" : "89F_escape"})
.reset_index()
)
# Filter escape df for low functional score mutations
escape_121F = (
escape_121F
.query("poor_cell_entry == False")
.copy()
.reset_index(drop=True)
)
# Add dummy phenotype column
escape_121F["phenotype"] = "escape"
# Clip lower scores to 0
escape_121F["escape_median"] = escape_121F["escape_median"].clip(lower=0)
# Calculate site sums
escape_121F = (
escape_121F.groupby(["site"])
.aggregate({
"escape_median" : "sum"
})
.rename(columns={"escape_median" : "121F_escape"})
.reset_index()
)
# Filter escape df for low functional score mutations
escape_372D = (
escape_372D
.query("poor_cell_entry == False")
.copy()
.reset_index(drop=True)
)
# Add dummy phenotype column
escape_372D["phenotype"] = "escape"
# Clip lower scores to 0
escape_372D["escape_median"] = escape_372D["escape_median"].clip(lower=0)
# Calculate site sums
escape_372D = (
escape_372D.groupby(["site"])
.aggregate({
"escape_median" : "sum"
})
.rename(columns={"escape_median" : "372D_escape"})
.reset_index()
)
# Merge frames
arevirumab_df = (
escape_89F.merge(
escape_121F,
how="left",
on="site",
validate="one_to_one",
)
)
arevirumab_df = (
arevirumab_df.merge(
escape_372D,
how="left",
on="site",
validate="one_to_one",
)
)
# calculate average site escape
arevirumab_df["average_escape"] = arevirumab_df[["89F_escape", "121F_escape", "372D_escape",]].mean(axis=1)
print("Arevirumab-3")
print(f"Max summed escape: {arevirumab_df['average_escape'].max()}") # Verify max matches altair plots
print(f'Top 25 escape sites averaged across antibodies: {sorted(arevirumab_df.nlargest(25, "average_escape", keep="all")["site"].tolist())}')
print(f'Top 15 escape sites for 12.1F: {sorted(arevirumab_df.nlargest(15, "121F_escape", keep="all")["site"].tolist())}')
print(f'Top 15 escape sites 8.9F: {sorted(arevirumab_df.nlargest(15, "89F_escape", keep="all")["site"].tolist())}')
print(f'Top 15 escape sites 37.2D: {sorted(arevirumab_df.nlargest(15, "372D_escape", keep="all")["site"].tolist())}')
print(f"50th percentile of summed escape scores: {arevirumab_df['average_escape'].quantile(0.50)}")
print(f"75th percentile of summed escape scores: {arevirumab_df['average_escape'].quantile(0.75)}")
print(f"90th percentile of summed escape scores: {arevirumab_df['average_escape'].quantile(0.90)}")
print(f"95th percentile of summed escape scores: {arevirumab_df['average_escape'].quantile(0.95)}")
print(f"99th percentile of summed escape scores: {arevirumab_df['average_escape'].quantile(0.99)}")
print()
arevirumab_df_aA = arevirumab_df.copy()
arevirumab_df_bB = arevirumab_df.copy()
arevirumab_df_cC = arevirumab_df.copy()
arevirumab_df_aA["chain"] = (
arevirumab_df_aA.apply(lambda x: "A" if x["site"] <= 259 else "a", axis=1)
)
arevirumab_df_bB["chain"] = (
arevirumab_df_bB.apply(lambda x: "B" if x["site"] <= 259 else "b", axis=1)
)
arevirumab_df_cC["chain"] = (
arevirumab_df_cC.apply(lambda x: "C" if x["site"] <= 259 else "c", axis=1)
)
arevirumab_df = (
pd.concat([
arevirumab_df_aA,
arevirumab_df_bB,
arevirumab_df_cC,
], ignore_index=True)
)
arevirumab_df = arevirumab_df.astype({"site" : "int"})
polyclonal.pdb_utils.reassign_b_factor(
input_pdbfile=pdb_file,
output_pdbfile=pdb_arevirumab,
df=arevirumab_df,
metric_col="average_escape",
site_col="site",
chain_col="chain",
)
Natural sequence variation: 50th percentile of entropy: 0.0 75th percentile of entropy: 0.0456061195273183 90th percentile of entropys: 0.3005920000818175 Entropy score range: -0.0 to 1.278653183606825
Functional DMS scores: Sites greater than 0: [101, 116, 125, 126, 128, 131, 149, 161] 50th percentile of averaged functional scores: -1.666466111111111 Functional score range: -3.9472499999999995 to 0.5542526315789473
Functional DMS scores: Sites greater than 0: [101, 116, 125, 126, 128, 131, 149, 161] 50th percentile of averaged functional scores: -1.666466111111111 Functional score range: -3.9472499999999995 to 0.5542526315789473
2510C Max summed escape: 60.9395753 50th percentile of summed escape scores: 0.7075149999999999 75th percentile of summed escape scores: 1.47419625 90th percentile of summed escape scores: 2.9885005000000002 95th percentile of summed escape scores: 4.57355068 99th percentile of summed escape scores: 8.63142925 121F Max summed escape: 25.224800000000002 50th percentile of summed escape scores: 0.560305 75th percentile of summed escape scores: 1.5247574999999998 90th percentile of summed escape scores: 3.6041516 95th percentile of summed escape scores: 7.266912 99th percentile of summed escape scores: 15.14485
377H Max summed escape: 77.929 50th percentile of summed escape scores: 1.0570673 75th percentile of summed escape scores: 2.0096051 90th percentile of summed escape scores: 3.5859749 95th percentile of summed escape scores: 4.5619049 99th percentile of summed escape scores: 13.246235840000034
256A Max summed escape: 41.118 50th percentile of summed escape scores: 0.3497 75th percentile of summed escape scores: 0.826659 90th percentile of summed escape scores: 1.5706776000000002 95th percentile of summed escape scores: 1.9977084 99th percentile of summed escape scores: 3.5179488599999997 372D Max summed escape: 14.438883 50th percentile of summed escape scores: 0.13061999999999999 75th percentile of summed escape scores: 0.42711161 90th percentile of summed escape scores: 0.9017537600000001 95th percentile of summed escape scores: 1.3170236399999968 99th percentile of summed escape scores: 3.4113973360000007
89F Max summed escape: 39.0564 50th percentile of summed escape scores: 0.646014 75th percentile of summed escape scores: 1.51272325 90th percentile of summed escape scores: 3.0387528 95th percentile of summed escape scores: 9.785661999999958 99th percentile of summed escape scores: 22.984155000000005
Arevirumab-3 Max summed escape: 15.862030000000003 Top 25 escape sites averaged across antibodies: [92, 111, 117, 119, 120, 121, 123, 124, 125, 127, 129, 134, 135, 138, 141, 147, 148, 149, 150, 153, 160, 248, 253, 254, 395] Top 15 escape sites for 12.1F: [91, 92, 111, 127, 132, 134, 135, 139, 148, 149, 160, 248, 253, 334, 338] Top 15 escape sites 8.9F: [115, 117, 119, 121, 123, 124, 125, 129, 135, 138, 141, 147, 148, 149, 150] Top 15 escape sites 37.2D: [99, 123, 129, 135, 142, 149, 156, 169, 177, 254, 339, 343, 395, 397, 398] 50th percentile of summed escape scores: 0.5348417166666667 75th percentile of summed escape scores: 1.1892281666666666 90th percentile of summed escape scores: 3.1727534000000004 95th percentile of summed escape scores: 5.58291804999999 99th percentile of summed escape scores: 10.838794915433343