library_correlations.ipynb¶

This script calculates stats about functional selections and variants present in libraries.

  • The mean of all LibA selections vs the mean of all LibB selections

  • All selections for a specific condition

  • Histogram of variants by # of mutations

  • written by Brendan Larsen

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
nipah_config = None

codon_variants_file = None

CHO_corr_plot_save = None
entry_binding_corr_heatmap_plot = None
CHO_EFNB2_indiv_plot_save = None
CHO_EFNB3_indiv_plot_save = None

histogram_plot = None
uniq_barcodes_per_lib_df = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
codon_variants_file = "results/variants/codon_variants.csv"
CHO_corr_plot_save = "results/images/CHO_corr_plot_save.html"
entry_binding_corr_heatmap_plot = "results/images/entry_binding_corr_heatmap_plot.html"
CHO_EFNB2_indiv_plot_save = "results/images/CHO_EFNB2_all_corrs.html"
CHO_EFNB3_indiv_plot_save = "results/images/CHO_EFNB3_all_corrs.html"
histogram_plot = "results/images/variants_histogram.html"
uniq_barcodes_per_lib_df = "results/tables/uniq_barcodes_per_lib_df.csv"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
from collections import Counter
import sys

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")

#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')

For running interactively:

In [4]:
if histogram_plot is None:
    #altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"
    codon_variants_file = "results/variants/codon_variants.csv"

Read in config files

In [5]:
with open(nipah_config) as f:
    config = yaml.safe_load(f)

with open("data/func_effects_config.yml") as y:
    config_func = yaml.safe_load(y)

Read in selections¶

In [6]:
cho_efnb2_low_selections = config_func["avg_func_effects"]["CHO_bEFNB2"][
    "selections"
]
LibA_CHO_EFNB2 = [
    selection + "_func_effects.csv"
    for selection in cho_efnb2_low_selections
    if "LibA" in selection and "LibB" not in selection
]
LibB_CHO_EFNB2 = [
    selection + "_func_effects.csv"
    for selection in cho_efnb2_low_selections
    if "LibB" in selection and "LibA" not in selection
]

cho_efnb3_low_selections = config_func["avg_func_effects"]["CHO_bEFNB3"][
    "selections"
]
LibA_CHO_EFNB3 = [
    selection + "_func_effects.csv"
    for selection in cho_efnb3_low_selections
    if "LibA" in selection and "LibB" not in selection
]
LibB_CHO_EFNB3 = [
    selection + "_func_effects.csv"
    for selection in cho_efnb3_low_selections
    if "LibB" in selection and "LibA" not in selection
]

Calculate correlations for LibA and LibB for CHO-EFNB2 cell entry selections¶

In [7]:
# Define the base directory where CSV files are stored
path = "results/func_effects/by_selection/"


# Function to process functional selections from a specific library
def process_func_selections(library, library_name):
    df_list = []  # Initialize a list to store dataframes
    clock = 1  # Counter to uniquely name columns for each file

    # Loop through each file in the library
    for file_name in library:
        file_path = os.path.join(path, file_name)  # Construct the full file path

        # Read the CSV file into a dataframe, then rename and drop specific columns
        func_scores = pd.read_csv(file_path)
        func_scores_renamed = func_scores.rename(
            columns={
                "functional_effect": f"functional_effect_{clock}",
                "times_seen": f"times_seen_{clock}",
            }
        )
        func_scores_renamed.drop(["latent_phenotype_effect"], axis=1, inplace=True)

        df_list.append(func_scores_renamed)  # Append modified dataframe to the list
        clock += 1  # Increment counter

    # Merge all dataframes on 'site', 'mutant', and 'wildtype' columns
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(
            merged_df, df, on=["site", "mutant", "wildtype"], how="outer"
        )

    # Calculate median values of functional effects and times seen across all files
    lib_columns_func = [col for col in merged_df.columns if "functional_effect" in col]
    merged_df[f"mean_effect_{library_name}"] = merged_df[lib_columns_func].mean(
        axis=1
    )
    lib_columns_times_seen = [col for col in merged_df.columns if "times_seen" in col]
    merged_df[f"mean_times_seen_{library_name}"] = merged_df[
        lib_columns_times_seen
    ].mean(axis=1)

    # Drop intermediate columns used for calculations
    lib_columns = [col for col in merged_df.columns if re.search(r"_\d+", col)]
    merged_df = merged_df.drop(columns=lib_columns)
    return merged_df


# Process selections for two libraries and two cell types
A_selections_E2 = process_func_selections(LibA_CHO_EFNB2, "LibA")
B_selections_E2 = process_func_selections(LibB_CHO_EFNB2, "LibB")
A_selections_E3 = process_func_selections(LibA_CHO_EFNB3, "LibA")
B_selections_E3 = process_func_selections(LibB_CHO_EFNB3, "LibB")


# Function to merge selections from two libraries
def merge_selections(A_selections, B_selections):
    merged_selections = pd.merge(
        A_selections, B_selections, on=["wildtype", "site", "mutant"], how="inner"
    )
    lib_columns_times_seen = [
        col for col in merged_selections.columns if "times_seen" in col
    ]
    merged_selections["times_seen"] = merged_selections[lib_columns_times_seen].median(
        axis=1
    )
    return merged_selections


# Merge selections and add cell type information
CHO_EFNB2_merged = merge_selections(A_selections_E2, B_selections_E2)
CHO_EFNB2_merged["cell_type"] = "CHO-bEFNB2"
CHO_EFNB3_merged = merge_selections(A_selections_E3, B_selections_E3)
CHO_EFNB3_merged["cell_type"] = "CHO-bEFNB3"

# Concatenate merged selections for both cell types
both_selections = pd.concat([CHO_EFNB2_merged, CHO_EFNB3_merged])


# Function to generate and display a scatter plot comparing median effects from two libraries
def make_chart_median(df, title):
    slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
    selector = alt.param(name="SelectorName", value=1, bind=slider)

    empty = []
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
    )

    df = df[(df["mean_effect_LibA"].notna()) & (df["mean_effect_LibB"].notna())].round(2)
    size = df.shape[0]

    for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
        print(selection)
        tmp_df = df[df["cell_type"] == selection]
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
            df[f"mean_effect_LibA"], df[f"mean_effect_LibB"]
        )
        r_value = float(r_value)
        print(f"{r_value:.2f}")

        chart = (
            alt.Chart(tmp_df, title=f"{selection}")
            .mark_point(filled=True,size=20)
            .encode(
                x=alt.X("mean_effect_LibA", title="Library A"),
                y=alt.Y("mean_effect_LibB", title="Library B"),
                tooltip=["site", "wildtype", "mutant", "times_seen"],
                size=alt.condition(variant_selector, alt.value(100), alt.value(15)),
                color=alt.condition(
                    alt.datum.times_seen < selector,
                    alt.value("transparent"),
                    alt.value("black"),
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
            )
        )
        empty.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector, selector)
    )
    return combined_effect_chart


CHO_EFNB2_corr_plot = make_chart_median(both_selections, "CHO-EFNB2")
CHO_EFNB2_corr_plot.display()
if histogram_plot is not None:
    CHO_EFNB2_corr_plot.save(CHO_corr_plot_save)
CHO-bEFNB2
0.92
CHO-bEFNB3
0.92
In [8]:
def plot_corr_heatmap(df):
    empty_chart = []

    for cell in list(df["cell_type"].unique()):
        tmp_df = df[df["cell_type"] == cell]
        chart = (
            alt.Chart(tmp_df, title=f"{cell}")
            .mark_rect()
            .encode(
                x=alt.X("mean_effect_LibA", title="Library A",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(
                    maxbins=80,
                ),
                y=alt.Y("mean_effect_LibB", title="Library B",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(
                    maxbins=80,
                ),
                color=alt.Color("count()", title="Count").scale(scheme="greenblue",type='log'),
                tooltip=['count()'],
            )
        )
        empty_chart.append(chart)

    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between cell entry measurements between libraries",offset=10)
    ).resolve_scale(y="shared", x="shared", color="shared")
    return combined_chart


entry_binding_corr_heatmap = plot_corr_heatmap(both_selections.query('times_seen >= 2'))
entry_binding_corr_heatmap.display()
entry_binding_corr_heatmap.save(entry_binding_corr_heatmap_plot)

Make correlations between individual selections¶

In [9]:
def process_individ_selections(library):
    df_list = []
    clock = 1
    for file_name in library:
        file_path = os.path.join(path, file_name)
        print(f"Processing file: {file_path}")
        # Read the current CSV file
        func_scores = pd.read_csv(file_path)
        func_scores_renamed = func_scores.rename(
            columns={
                "functional_effect": f"functional_effect_{clock}",
                "times_seen": f"times_seen_{clock}",
            }
        )
        func_scores_renamed.drop(["latent_phenotype_effect"], axis=1, inplace=True)

        # Append the dataframe to the list
        df_list.append(func_scores_renamed)
        clock = clock + 1

    # Merge all dataframes on 'site' and 'mutant'
    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(
            merged_df, df, on=["site", "mutant", "wildtype"], how="outer"
        )
    # Make list of how many selections are done for later correlation plots
    lib_size = len(library)
    number_list = [i for i in range(1, lib_size + 1)]
    return merged_df, number_list


CHO_EFNB2_indiv, lib_size_EFNB2 = process_individ_selections(
    LibA_CHO_EFNB2 + LibB_CHO_EFNB2
)
CHO_EFNB3_indiv, lib_size_EFNB3 = process_individ_selections(
    LibA_CHO_EFNB3 + LibB_CHO_EFNB3
)


def make_chart(df, number_list):
    empty_list = []
    for i in number_list:
        other_empty_list = []
        for j in number_list:
            df = df[
                (df[f"times_seen_{i}"] >= config["func_times_seen_cutoff"])
                & (df[f"times_seen_{j}"] >= config["func_times_seen_cutoff"])
                & (df[f"functional_effect_{i}"].notna())
                & (df[f"functional_effect_{j}"].notna())
            ]

            chart = (
                alt.Chart(df)
                .mark_circle(size=10, color="black", opacity=0.2)
                .encode(
                    x=alt.X(f"functional_effect_{i}"),
                    y=alt.Y(f"functional_effect_{j}"),
                    tooltip=["site", "wildtype", "mutant"],
                )
                .properties(height=alt.Step(10), width=alt.Step(10))
            )
            other_empty_list.append(chart)
        combined_effect_chart = alt.hconcat(*other_empty_list).resolve_scale(
            y="shared", x="shared", color="independent"
        )
        empty_list.append(combined_effect_chart)
    final_combined_chart = alt.vconcat(*empty_list).resolve_scale(
        y="shared", x="shared", color="independent"
    )
    return final_combined_chart


CHO_EFNB2_indiv_plot = make_chart(CHO_EFNB2_indiv, lib_size_EFNB2)
if histogram_plot is not None:
    CHO_EFNB2_indiv_plot.save(CHO_EFNB2_indiv_plot_save)
CHO_EFNB3_indiv_plot = make_chart(CHO_EFNB3_indiv, lib_size_EFNB3)
if histogram_plot is not None:
    CHO_EFNB3_indiv_plot.save(CHO_EFNB3_indiv_plot_save)
Processing file: results/func_effects/by_selection/LibA-231112-CHO-bEFNB2_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_1_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_2_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_3_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_5_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231222-CHO-bEFNB2_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231112-CHO-bEFNB2_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231116-CHO-bEFNB2_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-230725-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-230916-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibA-231024-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230720-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230815-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-230818-CHO-bEFNB3_func_effects.csv
Processing file: results/func_effects/by_selection/LibB-231116-CHO-bEFNB3_func_effects.csv

Now make histogram of variants¶

In [10]:
codon_variants = pd.read_csv(codon_variants_file)
display(codon_variants.head(3))
unique_barcodes_per_library = codon_variants.groupby("library")["barcode"].nunique()

uniq_barcodes_per_lib = pd.DataFrame(unique_barcodes_per_library)
display(uniq_barcodes_per_lib)
target library barcode variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions
0 gene LibA AAAAAAAAAAAAAGAA 5 ACC461ACT ATC475AGC I475S 2 1
1 gene LibA AAAAAAAAAAACCCAT 36 GCG16GAG CAG23GAG A16E Q23E 2 2
2 gene LibA AAAAAAAAAAAGTTTC 6 TAC319CCC Y319P 1 1
barcode
library
LibA 78450
LibB 60623

Find which sites are present, and which are missing¶

In [11]:
# Initialize an empty dictionary to keep track of observed mutations
aa_counts = {}
wildtypes = {}  # Dictionary to keep track of wildtype amino acids for each site


# Function to process each cell, update counts, and record wildtype amino acids
def process_cell(cell):
    if pd.notna(cell):  # Check if cell is not NaN
        substitutions = cell.split()
        for substitution in substitutions:
            if substitution[-1] not in ("*", "-") and substitution[0] not in (
                "*"
            ):  # Skip if substitution ends with '*' or '-'
                site = substitution[1:-1]
                mutation = substitution[-1]
                wildtype = substitution[0]
                site_mutation = site + mutation
                if site not in wildtypes:
                    wildtypes[site] = wildtype
                if site_mutation in aa_counts:
                    aa_counts[site_mutation] += 1
                else:
                    aa_counts[site_mutation] = 1


empty_mutants = []
empty_percent = []
for lib in ["LibA", "LibB"]:
    # Apply the function to each cell in the 'aa_substitutions' column
    tmp_df = codon_variants[codon_variants["library"] == lib]
    tmp_df["aa_substitutions"].apply(process_cell)

    # Generate all possible combinations excluding the wildtype for each site
    expected_sites = range(1, 533)
    possible_mutations = [
        "A",
        "C",
        "D",
        "E",
        "F",
        "G",
        "H",
        "I",
        "K",
        "L",
        "M",
        "N",
        "P",
        "Q",
        "R",
        "S",
        "T",
        "V",
        "W",
        "Y",
    ]

    # Adjust expected combinations to exclude the wildtype for each site
    expected_combinations = set()
    for site in expected_sites:
        site_str = str(site)
        if site_str in wildtypes:
            wildtype = wildtypes[site_str]
            for mutation in possible_mutations:
                if mutation != wildtype:  # Exclude the wildtype amino acid
                    expected_combinations.add(site_str + mutation)

    # Extract the actual combinations from the counts
    actual_combinations = set(aa_counts.keys())

    # Find missing combinations
    missing_combinations = expected_combinations - actual_combinations

    # Display results
    print(f"Number of unique site-mutation combinations observed: {len(aa_counts)}")
    print(
        f"Number of missing combinations (excluding wildtypes): {len(missing_combinations)}"
    )
    print(
        f"Total possible combinations excluding wildtypes: {len(expected_combinations)}"
    )
    empty_percent.append(len(actual_combinations) / len(expected_combinations))

uniq_barcodes_per_lib["percent"] = empty_percent
uniq_barcodes_per_lib["percent"] = uniq_barcodes_per_lib["percent"] * 100
uniq_barcodes_per_lib = uniq_barcodes_per_lib.round(2)
uniq_barcodes_per_lib = uniq_barcodes_per_lib.reset_index()
uniq_barcodes_per_lib.to_csv(uniq_barcodes_per_lib_df, index=False)
Number of unique site-mutation combinations observed: 10055
Number of missing combinations (excluding wildtypes): 54
Total possible combinations excluding wildtypes: 10108
Number of unique site-mutation combinations observed: 10080
Number of missing combinations (excluding wildtypes): 29
Total possible combinations excluding wildtypes: 10108
In [12]:
def calculate_fraction(library):
    total_A = codon_variants[codon_variants["library"] == library].shape[0]
    for number in range(5):
        fraction = codon_variants[
            (codon_variants["library"] == library)
            & (codon_variants["n_aa_substitutions"] == number)
        ].shape[0]
        print(
            f"For {library}, the fraction of sites with {number} mutations relative to WT are: {fraction/total_A:.2f}"
        )


calculate_fraction("LibA")
calculate_fraction("LibB")
For LibA, the fraction of sites with 0 mutations relative to WT are: 0.11
For LibA, the fraction of sites with 1 mutations relative to WT are: 0.64
For LibA, the fraction of sites with 2 mutations relative to WT are: 0.22
For LibA, the fraction of sites with 3 mutations relative to WT are: 0.03
For LibA, the fraction of sites with 4 mutations relative to WT are: 0.00
For LibB, the fraction of sites with 0 mutations relative to WT are: 0.11
For LibB, the fraction of sites with 1 mutations relative to WT are: 0.65
For LibB, the fraction of sites with 2 mutations relative to WT are: 0.21
For LibB, the fraction of sites with 3 mutations relative to WT are: 0.03
For LibB, the fraction of sites with 4 mutations relative to WT are: 0.00
In [13]:
def plot_histogram(df):
    df = df.drop(
        columns=[
            "barcode",
            "target",
            "variant_call_support",
            "codon_substitutions",
            "aa_substitutions",
            "n_codon_substitutions",
        ]
    )
    chart = (
        alt.Chart(df)
        .mark_bar(color="black")
        .encode(
            alt.X("n_aa_substitutions:O", title="# of AA Mutations",axis=alt.Axis(labelAngle=0)).scale(domain=[0,1,2,3,4,5,6,7]),
            alt.Y(
                "count()", title="Count", axis=alt.Axis(grid=False)
            ),  
            column=alt.Column(
                "library", header=alt.Header(title=None, labelFontSize=20,labelFont='Helvetica Light',labelFontStyle='bold')
            ),
        )
    )
    return chart


histogram = plot_histogram(codon_variants)
histogram.display()
if histogram_plot is not None:
    histogram.save(histogram_plot)
In [ ]: