Validation neutralization assays versus polyclonal fits

Compare actual measured neutralization values for specific mutants to the polyclonal fits.

Import Python modules:

In [1]:
import os
import pickle

import altair as alt

import pandas as pd
import numpy as np

import yaml

from scipy import stats

import warnings
warnings.simplefilter("ignore")

# Colour palettes (hex codes) available to the plots in this notebook.
palette = [
    '#999999',
    '#0072B2',
    '#E69F00',
    '#F0E442',
    '#009E73',
    '#56B4E9',
    '#D55E00',
    '#CC79A7',
]

# Same colours as `palette` with one extra entry appended.
extended_palette = [*palette, '#9F0162']

long_palette = [
    '#999999',
    '#9F0162',
    '#009F81',
    '#FF5AAF',
    '#8400CD',
    '#008DF9',
    '#00C2F9',
    '#FFB2FD',
    '#A40122',
    '#E20134',
    '#FF6E3A',
    '#FFC33B',
    '#00FCCF',
]

# Palette used for the figures below; note that the third and fourth colours
# are in the opposite order from `palette`.
figure_palette = [
    '#999999',
    '#0072B2',
    '#F0E442',
    '#E69F00',
    '#009E73',
    '#56B4E9',
    '#D55E00',
    '#CC79A7',
    '#9F0162',
    '#8400CD',
]

Read configuration and validation assay measurements:

In [2]:
# Load the analysis configuration.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Load the measured validation IC50s.
# `na_filter=False` (the documented boolean spelling) turns off missing-value
# detection so that empty fields stay as empty strings instead of NaN; later
# cells pick out the unmutated rows with `aa_substitutions == ''`.
validation_ic50s = pd.read_csv('data/V3_validation_ICs.csv', na_filter=False)

Now get the predictions from the averaged polyclonal model fits:

In [3]:
# Directory holding the averaged polyclonal model pickles for each virus.
averages_dir_by_virus = {
    'TRO11': 'results/antibody_escape/averages/',
    'BF520': '../HIV_Envelope_BF520_DMS/results/antibody_escape/averages/',
}

# Collect one frame of per-mutation IC50 predictions for every virus/antibody
# pair present in the validation measurements.
prediction_frames = []
for virus, virus_df in validation_ic50s.groupby("virus"):
    # Fail loudly on an unrecognised virus rather than hitting a NameError or
    # silently reusing the directory left over from the previous iteration.
    if virus not in averages_dir_by_virus:
        raise ValueError(
            f"No averaged-model directory configured for virus {virus!r}; "
            f"known viruses: {sorted(averages_dir_by_virus)}"
        )
    virus_data_path = averages_dir_by_virus[virus]

    for antibody, antibody_df in virus_df.groupby("antibody"):
        model_path = os.path.join(virus_data_path, f"{antibody}_polyclonal_model.pickle")
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by a trusted run of the analysis pipeline.
        with open(model_path, "rb") as f:
            model = pickle.load(f)
        df = model.mut_icXX_df_w_model_values(x=.5, icXX_col='IC50', log_fold_change_icXX_col='log_fold_change_IC50')
        # Tag the rows so they can be matched back to the validation table.
        df['virus'] = virus
        df['antibody'] = antibody
        prediction_frames.append(df)

validation_vs_prediction = pd.concat(prediction_frames, ignore_index=True)
In [4]:
# Label each predicted mutation as wildtype + site + mutant (e.g. "N156K") so it
# can be matched against the labels used in the validation measurements.
prediction_labels = (
    validation_vs_prediction['wildtype']
    + validation_vs_prediction['site']
    + validation_vs_prediction['mutant']
)
validation_vs_prediction['aa_substitutions'] = prediction_labels
mutations = validation_ic50s['aa_substitutions'].unique()

# Attach the model predictions to every validation measurement; measurements
# without a matching prediction keep NaN in the prediction columns.
merge_keys = ['aa_substitutions', 'virus', 'antibody']
plot_data = validation_ic50s.merge(validation_vs_prediction, how='left', on=merge_keys)

# Rows with an empty substitution label are assigned a predicted log fold
# change of 0.
is_unmutated = plot_data['aa_substitutions'] == ''
plot_data.loc[is_unmutated, 'log_fold_change_IC50 mean'] = 0

# Display the rows that have a prediction and were seen at least twice.
plot_data.query('times_seen>=2').dropna(subset=['log_fold_change_IC50 mean'])
Out[4]:
antibody virus aa_substitutions measured IC50 Env region site wildtype mutant log_fold_change_IC50 mean log_fold_change_IC50 median ... B-231011-rescue_6_ultra-007-1 A-230906-rescue_5-10-1074-1 B-231011-rescue_6_ultra-10-1074-1 B-231011-rescue_6_ultra-10-1074-2 A-250107-rescue_8-BG18-1 B-241206-rescue_8-BG18-1 A-240223-rescue_6-PGT121-1 B-241206-rescue_8-PGT121-1 A-240223-rescue_6-PGT128-1 B-241206-rescue_8-PGT128-1
1 007 TRO11 N156K 25.000 V1 loop 156 N K 1.577218 1.577218 ... 2.050126 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 007 TRO11 N156R 14.193 V1 loop 156 N R 1.663352 1.663352 ... 1.705874 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 007 TRO11 T189G 0.060 V2 loop 189 T G 1.047459 1.047459 ... 0.664836 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 007 TRO11 T297E 0.002 V3 loop 297 T E 0.327464 0.327464 ... 0.620463 NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 007 TRO11 T303G 25.000 V3 loop 303 T G 2.048955 2.048955 ... 2.419049 NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
110 BG18 TRO11 N332V 25.000 N332 glycan 332 N V 5.452237 5.452237 ... NaN NaN NaN NaN 9.417413 1.487060 NaN NaN NaN NaN
111 BG18 TRO11 S334Q 25.000 N332 glycan 334 S Q 4.991484 4.991484 ... NaN NaN NaN NaN 7.789030 2.193938 NaN NaN NaN NaN
112 BG18 TRO11 S334R 6.167 N332 glycan 334 S R 5.327832 5.327832 ... NaN NaN NaN NaN 8.042642 2.613022 NaN NaN NaN NaN
113 BG18 TRO11 S375W 0.014 Site 375 375 S W 0.002911 0.002911 ... NaN NaN NaN NaN 0.003957 0.001866 NaN NaN NaN NaN
114 BG18 TRO11 G441I 0.006 Site 441 441 G I 0.199924 0.199924 ... NaN NaN NaN NaN -0.005319 0.405167 NaN NaN NaN NaN

100 rows × 26 columns

In [5]:
# Work from a copy of the merged table in which the model's mean prediction
# carries a descriptive column name.
predictions_renamed = plot_data.rename(
    columns={"log_fold_change_IC50 mean": "predicted log fold change IC50"}
)

# Left side of the join: every validation measurement.
per_mutant = predictions_renamed[
    [
        "antibody",
        "virus",
        "aa_substitutions",
        "measured IC50",
        "predicted log fold change IC50",
        "times_seen",
        "n_models",
        "Env region",
    ]
]

# Right side of the join: the rows with an empty substitution label, which
# provide the reference IC50 for each antibody/virus pair.
per_unmutated = predictions_renamed.query("aa_substitutions == ''")[
    ["antibody", "virus", "measured IC50", "predicted log fold change IC50"]
]

paired = per_mutant.merge(
    per_unmutated,
    on=["antibody", "virus"],
    how="left",
    suffixes=[" mutant", " unmutated"],
)

# Measured fold change = mutant IC50 relative to the unmutated IC50.
paired["measured_fold_change"] = (
    paired["measured IC50 mutant"] / paired["measured IC50 unmutated"]
)

# Keep only rows for which the model produced a prediction.
fold_changes = paired.dropna(subset=['predicted log fold change IC50 mutant'])
In [6]:
# For every virus/antibody pair: scatter the model-predicted log fold change in
# IC50 against the log of the fold change measured in the validation assays,
# and print the squared correlation coefficient between the two.
for virus in fold_changes['virus'].unique():
    for antibody in fold_changes['antibody'].unique():
        # All validation rows for this virus/antibody pair.
        pair_df = fold_changes.query('virus==@virus').query('antibody==@antibody')
        if pair_df.empty:
            # The loops walk the full virus x antibody grid, so a pair may have
            # no rows at all; skip it instead of failing in the regression.
            continue

        # Points to plot: rows with times_seen >= 2 (rows whose times_seen is
        # NaN do not pass this filter).
        sub_plot_data = pair_df.query('times_seen>=2').copy()
        # Compare with `==`; `is ''` tests object identity, which is not a
        # reliable way to detect an empty string.
        sub_plot_data['aa_substitutions'] = [
            f'wildtype {virus}' if x == '' else x
            for x in sub_plot_data['aa_substitutions']
        ]
        sub_plot_data['log measured_fold_change'] = np.log(sub_plot_data['measured_fold_change'])

        measured_log_fc = sub_plot_data["log measured_fold_change"]
        predicted_log_fc = sub_plot_data["predicted log fold change IC50 mutant"]

        fold_change_chart = (
            alt.Chart(sub_plot_data)
            .encode(
                x=alt.X(
                    "log measured_fold_change",
                    title=["Traditional neutralization assay", "measured log fold change IC50"],
                    # Axis runs from the smallest value to 1.25x the largest.
                    scale=alt.Scale(
                        nice=False,
                        domain=(measured_log_fc.min(), measured_log_fc.max()*1.25),
                    ),
                ),
                y=alt.Y(
                    "predicted log fold change IC50 mutant",
                    title=["Deep mutational scanning", "measured log fold change IC50"],
                    scale=alt.Scale(
                        nice=False,
                        domain=(predicted_log_fc.min(), predicted_log_fc.max()*1.25),
                    ),
                ),
                color=alt.Color(
                    "Env region",
                    # Skip the first (grey) entry of the figure palette.
                    scale=alt.Scale(range=figure_palette[1:]),
                    sort=[
                        'wildtype TRO11',
                    ],
                ),
                # Show every column in the tooltip, with compact formatting
                # for float columns.
                tooltip=[
                    alt.Tooltip(c, format=".3g") if sub_plot_data[c].dtype == float
                    else c
                    for c in sub_plot_data.columns.tolist()
                ],
            )
            .mark_circle(filled=True, size=100, opacity=1)
            .properties(width=150, height=150)
        )

        # Rows used for the correlation: drop labels containing a space
        # (presumably multi-mutation entries -- confirm).
        # NOTE(review): unlike the plotted points, these rows are not filtered
        # on times_seen >= 2, so the unmutated reference row is included here.
        antibody_df = pair_df[~pair_df['aa_substitutions'].str.contains(" ")]
        display(antibody_df)
        print(f"{antibody}:")
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            antibody_df["predicted log fold change IC50 mutant"].astype(float),
            np.log(antibody_df["measured_fold_change"].astype(float)))
        print(f"Predicted fold change correlation (R^2): {round(r_value**2,3)}")

        # Dashed vertical rule at the largest measured log fold change plotted
        # (presumably the upper limit of the assay -- confirm).
        line = (
            alt.Chart(pd.DataFrame({'log measured_fold_change': [measured_log_fc.max()]}))
            .mark_rule(strokeDash=[8,8])
            .encode(x='log measured_fold_change')
        )
        (fold_change_chart + line).configure_axis(grid=False).display()
antibody virus aa_substitutions measured IC50 mutant predicted log fold change IC50 mutant times_seen n_models Env region measured IC50 unmutated predicted log fold change IC50 unmutated measured_fold_change
0 007 TRO11 0.009 0.000000 NaN NaN 0.009 0.0 1.000000
1 007 TRO11 N156K 25.000 1.577218 11.583333 2.0 V1 loop 0.009 0.0 2777.777778
2 007 TRO11 N156R 14.193 1.663352 5.666667 2.0 V1 loop 0.009 0.0 1577.000000
3 007 TRO11 T189G 0.060 1.047459 4.750000 2.0 V2 loop 0.009 0.0 6.666667
4 007 TRO11 T297E 0.002 0.327464 2.500000 2.0 V3 loop 0.009 0.0 0.222222
7 007 TRO11 T303G 25.000 2.048955 3.000000 2.0 V3 loop 0.009 0.0 2777.777778
8 007 TRO11 T303K 25.000 1.847868 8.916667 2.0 V3 loop 0.009 0.0 2777.777778
9 007 TRO11 D322K 25.000 1.556746 7.166667 2.0 V3 loop 0.009 0.0 2777.777778
10 007 TRO11 G324D 0.030 0.184986 10.166667 2.0 V3 loop 0.009 0.0 3.333333
11 007 TRO11 G324V 25.000 1.019753 4.000000 2.0 V3 loop 0.009 0.0 2777.777778
12 007 TRO11 D325Q 0.004 -0.136994 4.166667 2.0 V3 loop 0.009 0.0 0.444444
13 007 TRO11 R327L 0.412 1.386504 7.166667 2.0 V3 loop 0.009 0.0 45.777778
14 007 TRO11 R327P 11.418 -0.155738 4.000000 2.0 V3 loop 0.009 0.0 1268.666667
15 007 TRO11 H330D 0.004 0.593200 6.000000 2.0 V3 loop 0.009 0.0 0.444444
16 007 TRO11 H330E 0.012 0.433379 7.000000 2.0 V3 loop 0.009 0.0 1.333333
17 007 TRO11 N332K 0.003 0.122872 10.000000 2.0 N332 glycan 0.009 0.0 0.333333
18 007 TRO11 N332V 0.004 -0.167978 4.500000 2.0 N332 glycan 0.009 0.0 0.444444
19 007 TRO11 S334Q 0.005 -0.134992 2.500000 2.0 N332 glycan 0.009 0.0 0.555556
20 007 TRO11 S334R 0.005 -0.023902 7.000000 2.0 N332 glycan 0.009 0.0 0.555556
21 007 TRO11 S375W 0.049 1.295022 4.166667 2.0 Site 375 0.009 0.0 5.444444
22 007 TRO11 G441I 0.685 0.603175 5.500000 2.0 Site 441 0.009 0.0 76.111111
007:
Predicted fold change correlation (R^2): 0.556
antibody virus aa_substitutions measured IC50 mutant predicted log fold change IC50 mutant times_seen n_models Env region measured IC50 unmutated predicted log fold change IC50 unmutated measured_fold_change
23 10-1074 TRO11 0.020 0.000000 NaN NaN 0.02 0.0 1.00
24 10-1074 TRO11 N156K 0.038 0.156583 12.166667 3.0 V1 loop 0.02 0.0 1.90
25 10-1074 TRO11 N156R 0.020 0.098729 4.500000 3.0 V1 loop 0.02 0.0 1.00
26 10-1074 TRO11 T189G 0.020 -0.101894 4.111111 3.0 V2 loop 0.02 0.0 1.00
27 10-1074 TRO11 T297E 0.014 0.068631 2.000000 3.0 V3 loop 0.02 0.0 0.70
30 10-1074 TRO11 T303G 0.016 -0.024410 2.750000 3.0 V3 loop 0.02 0.0 0.80
31 10-1074 TRO11 T303K 0.015 -0.108850 8.000000 3.0 V3 loop 0.02 0.0 0.75
32 10-1074 TRO11 D322K 0.034 -0.227688 7.416667 3.0 V3 loop 0.02 0.0 1.70
33 10-1074 TRO11 G324D 0.035 -0.190156 9.083333 3.0 V3 loop 0.02 0.0 1.75
34 10-1074 TRO11 G324V 0.001 0.465700 3.750000 3.0 V3 loop 0.02 0.0 0.05
35 10-1074 TRO11 D325Q 1.653 0.665581 3.250000 3.0 V3 loop 0.02 0.0 82.65
36 10-1074 TRO11 R327L 0.043 0.296977 7.027778 3.0 V3 loop 0.02 0.0 2.15
37 10-1074 TRO11 R327P 1.154 1.670161 4.388889 3.0 V3 loop 0.02 0.0 57.70
38 10-1074 TRO11 H330D 0.060 1.926727 6.333333 3.0 V3 loop 0.02 0.0 3.00
39 10-1074 TRO11 H330E 0.063 3.183692 7.111111 3.0 V3 loop 0.02 0.0 3.15
40 10-1074 TRO11 N332K 25.000 4.579166 12.305556 3.0 N332 glycan 0.02 0.0 1250.00
41 10-1074 TRO11 N332V 25.000 4.891890 5.305556 3.0 N332 glycan 0.02 0.0 1250.00
42 10-1074 TRO11 S334Q 25.000 5.054927 3.416667 3.0 N332 glycan 0.02 0.0 1250.00
43 10-1074 TRO11 S334R 25.000 4.013218 8.722222 3.0 N332 glycan 0.02 0.0 1250.00
44 10-1074 TRO11 S375W 0.074 0.427159 3.333333 3.0 Site 375 0.02 0.0 3.70
45 10-1074 TRO11 G441I 0.039 -0.103308 5.083333 3.0 Site 441 0.02 0.0 1.95
10-1074:
Predicted fold change correlation (R^2): 0.734
antibody virus aa_substitutions measured IC50 mutant predicted log fold change IC50 mutant times_seen n_models Env region measured IC50 unmutated predicted log fold change IC50 unmutated measured_fold_change
46 PGT121 TRO11 0.0191 0.000000 NaN NaN 0.0191 0.0 1.000000
47 PGT121 TRO11 N156K 0.0220 0.120422 11.125 2.0 V1 loop 0.0191 0.0 1.151832
48 PGT121 TRO11 N156R 0.0120 -0.025521 6.500 2.0 V1 loop 0.0191 0.0 0.628272
49 PGT121 TRO11 T189G 0.0150 0.002666 5.500 2.0 V2 loop 0.0191 0.0 0.785340
50 PGT121 TRO11 T297E 0.0060 -0.040831 3.000 2.0 V3 loop 0.0191 0.0 0.314136
53 PGT121 TRO11 T303G 0.0090 0.152034 3.500 2.0 V3 loop 0.0191 0.0 0.471204
54 PGT121 TRO11 T303K 0.0060 -0.002799 10.125 2.0 V3 loop 0.0191 0.0 0.314136
55 PGT121 TRO11 D322K 0.0240 0.133515 6.125 2.0 V3 loop 0.0191 0.0 1.256545
56 PGT121 TRO11 G324D 3.4920 2.440812 9.500 2.0 V3 loop 0.0191 0.0 182.827225
57 PGT121 TRO11 G324V 25.0000 2.261605 6.500 2.0 V3 loop 0.0191 0.0 1308.900524
58 PGT121 TRO11 D325Q 2.4250 2.878719 4.125 2.0 V3 loop 0.0191 0.0 126.963351
59 PGT121 TRO11 R327L 0.0290 0.260262 6.500 2.0 V3 loop 0.0191 0.0 1.518325
60 PGT121 TRO11 R327P 25.0000 2.424695 5.000 2.0 V3 loop 0.0191 0.0 1308.900524
61 PGT121 TRO11 H330D 25.0000 3.837065 6.250 2.0 V3 loop 0.0191 0.0 1308.900524
62 PGT121 TRO11 H330E 0.0520 3.458231 7.750 2.0 V3 loop 0.0191 0.0 2.722513
63 PGT121 TRO11 N332K 25.0000 4.675454 11.875 2.0 N332 glycan 0.0191 0.0 1308.900524
64 PGT121 TRO11 N332V 11.4420 2.966199 6.750 2.0 N332 glycan 0.0191 0.0 599.057592
65 PGT121 TRO11 S334Q 18.7690 3.257323 3.000 2.0 N332 glycan 0.0191 0.0 982.670157
66 PGT121 TRO11 S334R 11.9170 3.793309 8.250 2.0 N332 glycan 0.0191 0.0 623.926702
67 PGT121 TRO11 S375W 0.0600 0.215050 4.500 2.0 Site 375 0.0191 0.0 3.141361
68 PGT121 TRO11 G441I 0.0230 -0.171324 4.500 2.0 Site 441 0.0191 0.0 1.204188
PGT121:
Predicted fold change correlation (R^2): 0.768
antibody virus aa_substitutions measured IC50 mutant predicted log fold change IC50 mutant times_seen n_models Env region measured IC50 unmutated predicted log fold change IC50 unmutated measured_fold_change
69 PGT128 TRO11 0.016676 0.000000 NaN NaN 0.016676 0.0 1.000000
70 PGT128 TRO11 N156K 0.036000 0.155409 11.166667 2.0 V1 loop 0.016676 0.0 2.158791
71 PGT128 TRO11 N156R 0.016000 -0.114363 6.500000 2.0 V1 loop 0.016676 0.0 0.959463
72 PGT128 TRO11 T189G 0.021000 -0.096800 5.500000 2.0 V2 loop 0.016676 0.0 1.259295
73 PGT128 TRO11 T297E 0.745000 1.261306 3.000000 2.0 V3 loop 0.016676 0.0 44.674982
76 PGT128 TRO11 T303G 25.000000 2.130205 3.500000 2.0 V3 loop 0.016676 0.0 1499.160470
77 PGT128 TRO11 T303K 1.572000 2.298899 10.666667 2.0 V3 loop 0.016676 0.0 94.267210
78 PGT128 TRO11 D322K 0.045000 0.162696 6.000000 2.0 V3 loop 0.016676 0.0 2.698489
79 PGT128 TRO11 G324D 25.000000 2.152204 9.583333 2.0 V3 loop 0.016676 0.0 1499.160470
80 PGT128 TRO11 G324V 25.000000 0.855511 5.416667 2.0 V3 loop 0.016676 0.0 1499.160470
81 PGT128 TRO11 D325Q 0.012000 0.267007 3.250000 2.0 V3 loop 0.016676 0.0 0.719597
82 PGT128 TRO11 R327L 0.106000 0.974874 6.500000 2.0 V3 loop 0.016676 0.0 6.356440
83 PGT128 TRO11 R327P 25.000000 1.081601 4.833333 2.0 V3 loop 0.016676 0.0 1499.160470
84 PGT128 TRO11 H330D 0.287000 2.298389 6.250000 2.0 V3 loop 0.016676 0.0 17.210362
85 PGT128 TRO11 H330E 1.371000 1.357455 6.666667 2.0 V3 loop 0.016676 0.0 82.213960
86 PGT128 TRO11 N332K 25.000000 2.929567 10.833333 2.0 N332 glycan 0.016676 0.0 1499.160470
87 PGT128 TRO11 N332V 25.000000 2.424858 6.666667 2.0 N332 glycan 0.016676 0.0 1499.160470
88 PGT128 TRO11 S334Q 25.000000 -0.708621 3.000000 2.0 N332 glycan 0.016676 0.0 1499.160470
89 PGT128 TRO11 S334R 13.084000 2.481485 7.500000 2.0 N332 glycan 0.016676 0.0 784.600624
90 PGT128 TRO11 S375W 0.139000 0.662335 4.500000 2.0 Site 375 0.016676 0.0 8.335332
91 PGT128 TRO11 G441I 0.044000 -0.203721 4.500000 2.0 Site 441 0.016676 0.0 2.638522
PGT128:
Predicted fold change correlation (R^2): 0.388
antibody virus aa_substitutions measured IC50 mutant predicted log fold change IC50 mutant times_seen n_models Env region measured IC50 unmutated predicted log fold change IC50 unmutated measured_fold_change
92 BG18 TRO11 0.001 0.000000 NaN NaN 0.001 0.0 1.0
93 BG18 TRO11 N156K 0.008 0.061903 12.0 2.0 V1 loop 0.001 0.0 8.0
94 BG18 TRO11 N156R 0.003 -0.000933 7.5 2.0 V1 loop 0.001 0.0 3.0
95 BG18 TRO11 T189G 0.002 -0.015008 6.0 2.0 V2 loop 0.001 0.0 2.0
96 BG18 TRO11 T297E 0.001 -0.001814 2.5 2.0 V3 loop 0.001 0.0 1.0
99 BG18 TRO11 T303G 0.003 0.008623 3.5 2.0 V3 loop 0.001 0.0 3.0
100 BG18 TRO11 T303K 0.002 -0.071947 11.5 2.0 V3 loop 0.001 0.0 2.0
101 BG18 TRO11 D322K 0.005 -0.004240 7.0 2.0 V3 loop 0.001 0.0 5.0
102 BG18 TRO11 G324D 0.006 0.036050 11.0 2.0 V3 loop 0.001 0.0 6.0
103 BG18 TRO11 G324V 0.001 0.455168 8.0 2.0 V3 loop 0.001 0.0 1.0
104 BG18 TRO11 D325Q 0.002 -0.022207 4.0 2.0 V3 loop 0.001 0.0 2.0
105 BG18 TRO11 R327L 0.002 -0.012135 7.0 2.0 V3 loop 0.001 0.0 2.0
106 BG18 TRO11 R327P 0.006 0.245125 5.5 2.0 V3 loop 0.001 0.0 6.0
107 BG18 TRO11 H330D 0.005 0.786674 6.5 2.0 V3 loop 0.001 0.0 5.0
108 BG18 TRO11 H330E 0.009 5.209427 8.0 2.0 V3 loop 0.001 0.0 9.0
109 BG18 TRO11 N332K 25.000 4.333063 14.0 2.0 N332 glycan 0.001 0.0 25000.0
110 BG18 TRO11 N332V 25.000 5.452237 7.0 2.0 N332 glycan 0.001 0.0 25000.0
111 BG18 TRO11 S334Q 25.000 4.991484 3.0 2.0 N332 glycan 0.001 0.0 25000.0
112 BG18 TRO11 S334R 6.167 5.327832 8.5 2.0 N332 glycan 0.001 0.0 6167.0
113 BG18 TRO11 S375W 0.014 0.002911 4.5 2.0 Site 375 0.001 0.0 14.0
114 BG18 TRO11 G441I 0.006 0.199924 5.0 2.0 Site 441 0.001 0.0 6.0
BG18:
Predicted fold change correlation (R^2): 0.745
In [ ]: