import os

import altair as alt
import numpy as np
import pandas as pd

import matplotlib
matplotlib.rcParams['svg.fonttype'] = 'none'
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import dmslogo
from dmslogo.colorschemes import ValueToColorMap

# Make the output directory (and any missing parents such as ./results) if it
# doesn't exist. exist_ok=True makes this a no-op when the directory is
# already there, so no separate existence check is needed.
os.makedirs('./results/logoplots/', exist_ok=True)

# Filtering parameters.

# Quantile cutoffs (between 0 and 1) used when choosing which sites to show:
# a site is selected when its per-site score lies strictly above this quantile
# of the per-site scores, so 0.99 selects roughly the top 1% of sites.
sums_threshold = 0.99  # named for the per-site summed escape
max_threshold = 0.99  # named for the per-site maximum escape

# Minimum number of times a unique barcoded mutation must have been observed
# (rows are kept when times_seen >= cutoff).
func_times_seen_cutoff = 3  # in the functional selections
antibody_times_seen = 3  # in the antibody escape selections

# Intended upper bound on escape_std for antibody escape mutants.
# NOTE(review): no filter in the visible code references this value -- confirm
# whether it is still meant to be applied.
escape_std_cutoff = 5

# Path of the averaged mutation-effect CSV for each antibody, keyed by
# antibody name (the name is also the file-name prefix).
ab_dfs = {
    ab: f'./results/antibody_escape/averages/{ab}_mut_effect.csv'
    for ab in ('007', '10-1074', 'PGT121', 'PGT128', 'BG18')
}

# Functional effects table, merged onto every antibody table below.
func_scores = pd.read_csv('./results/func_effects/averages/TZM-bl_entry_func_effects.csv')

# Site numbering table; expose 'reference_site' under the name 'site' so it
# can be joined on the same 'site' key as the other tables.
site_numbering_map = (
    pd.read_csv('./data/site_numbering_map.csv')
    .assign(site=lambda df: df['reference_site'])
)

# Build one filtered table per antibody: escape data joined with functional
# effects and sequential site numbers.
processed_dfs = {}
for ab, escape_csv in ab_dfs.items():
    escape_df = pd.read_csv(escape_csv)

    # Left-join the functional effects onto the escape data. Columns present in
    # both tables (e.g. times_seen) get the '_ab' / '_func_effects' suffixes.
    merged = escape_df.merge(
        func_scores,
        on=['site', 'wildtype', 'mutant'],
        how='left',
        suffixes=['_ab', '_func_effects'],
    )
    # Attach the sequential site number (default inner join on 'site').
    merged = merged.merge(site_numbering_map[['site', 'sequential_site']], on=['site'])

    # Drop '*' and '-' mutants and anything observed too few times in either
    # the functional or the antibody selections. Rows without a functional
    # score have NaN times_seen_func_effects and fail the >= comparison.
    keep = (
        ~merged['mutant'].isin(['*', '-'])
        & (merged['times_seen_func_effects'] >= func_times_seen_cutoff)
        & (merged['times_seen_ab'] >= antibody_times_seen)
    )
    processed_dfs[ab] = merged[keep]

# Collect, across all antibodies, the sites whose per-site escape score is
# strictly above a quantile cutoff. Two per-site scores are considered: the
# maximum escape_median (cut at max_threshold) and the summed escape_median
# (cut at sums_threshold). First-seen order is preserved.
all_sites_to_show = []
for processed_df in processed_dfs.values():
    site_escape = processed_df.groupby('site')['escape_median']
    # Pair each per-site score with the threshold named after it.
    for site_scores, threshold in (
        (site_escape.max(), max_threshold),
        (site_escape.sum(), sums_threshold),
    ):
        cutoff = site_scores.quantile(threshold)
        for site in site_scores[site_scores > cutoff].index:
            if site not in all_sites_to_show:
                all_sites_to_show.append(site)
print(sorted(all_sites_to_show))

# Curated list of sites that includes sites closeby. This replaces the
# automatically selected list above, which is only printed for reference.
all_sites_to_show = ['79', '136', '137', '138', '139', '140', '141', '142', '151', '160', '161',
                     '162', '186', '187', '188', '197', '198', '199', '299',
                     '301', '302', '303', '319', '320', '321', '322', '323', '323a', '324', '325', '326',
                     '327', '328', '329', '330', '331', '332', '333', '334', '375']
print(all_sites_to_show)

# Notebook output of the two print calls above (automatically selected sites, then the curated list):
# ['115', '136', '137', '138', '139', '140', '142', '151', '160', '161', '162', '177', '178', '182', '186', '188', '199', '274', '299', '302', '303', '319', '325', '328', '330', '332', '334', '339', '347', '374', '415', '419', '423', '430', '436', '442', '455', '462', '487', '509', '518', '557', '559', '56', '565', '61', '674', '680', '690', '700', '75', '77']
# ['79', '136', '137', '138', '139', '140', '141', '142', '151', '160', '161', '162', '186', '187', '188', '197', '198', '199', '299', '301', '302', '303', '319', '320', '321', '322', '323', '323a', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '375']

# Build a colormap from a sub-range of an existing one. Adapted from:
# https://stackoverflow.com/questions/18926031/how-to-extract-a-subset-of-a-colormap-as-a-new-colormap-in-matplotlib
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """Return a new colormap covering only part of ``cmap``.

    ``cmap`` is sampled at ``n`` evenly spaced positions between ``minval``
    and ``maxval``, and the sampled colors are turned into a
    ``LinearSegmentedColormap`` named ``trunc(<cmap.name>,<minval>,<maxval>)``
    (both bounds formatted to two decimals).
    """
    sampled_colors = cmap(np.linspace(minval, maxval, n))
    truncated_name = f'trunc({cmap.name},{minval:.2f},{maxval:.2f})'
    return colors.LinearSegmentedColormap.from_list(truncated_name, sampled_colors)
# 10x10 grid of evenly spaced values from 0 to 50.
# NOTE(review): nothing in the visible code reads `arr` -- confirm it is still needed.
arr = np.linspace(0, 50, 100).reshape(10, 10)

# Use only the upper 80% of 'YlOrBr' (positions 0.2 through 1) for coloring.
cmap = plt.get_cmap('YlOrBr')
new_cmap = truncate_colormap(cmap, minval=.2, maxval=1)

# Make the figures for each antibody.

# Step 1: add the plotting columns to a copy of each antibody's table.
#   label     - x tick text: wildtype residue followed by the site
#   show_site - True for sites in all_sites_to_show
#   clip      - 'effect' capped at 0, so positive effects share one color
plot_dfs = {
    ab: processed_df.assign(
        label=lambda x: x['wildtype'] + x['site'],
        show_site=lambda x: x['site'].isin(all_sites_to_show),
        clip=lambda x: x['effect'].clip(upper=0),
    )
    for ab, processed_df in processed_dfs.items()
}

# Step 2: build ONE value-to-color map spanning the effect range of all
# antibodies. A single scale bar is saved below, so every logo plot has to use
# the same color scale for that scale bar to describe all of them.
all_effects = pd.concat([plot_df['effect'] for plot_df in plot_dfs.values()])
min_prop = all_effects.min()
max_prop = all_effects.clip(upper=0).max()
map1 = ValueToColorMap(minvalue=min_prop, maxvalue=max_prop, cmap=new_cmap)

# Step 3: draw and save one logo plot per antibody.
for ab, plot_df in plot_dfs.items():
    plot_df['color'] = plot_df['clip'].map(map1.val_to_color)
    # An empty antibody name switches the x tick labels off; every non-empty
    # name gets the wildtype+site labels.
    if ab != '':
        x_ticks = 'label'
    else:
        plot_df['blanks'] = ''
        x_ticks = 'blanks'
    fig, ax = dmslogo.draw_logo(
        data=plot_df.query('show_site'),
        x_col='sequential_site',
        xtick_col=x_ticks,
        letter_col='mutant',
        letter_height_col='escape_median',
        ylabel='Escape',
        xlabel='',
        color_col='color',
        title=ab,
        axisfontscale=1.5,
    )
    fig.savefig(f'./results/logoplots/{ab}_logoplot.svg', bbox_inches='tight', format='svg')

# Save the scale bar for the color map shared by all of the plots above.
fig, _ = map1.scale_bar(orientation='horizontal')
fig.savefig('./results/logoplots/scalebar_figure.svg', bbox_inches='tight', format='svg')

# Generate logo plots