library_correlations.ipynb¶
This script calculates stats about functional selections and variants present in libraries.
The mean of all LibA selections vs the mean of all LibB selections
All selections for a specific condition
Histogram of variants by # of mutations
written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
altair_config = None
nipah_config = None
codon_variants_file = None
CHO_corr_plot_save = None
entry_binding_corr_heatmap_plot = None
CHO_EFNB2_indiv_plot_save = None
CHO_EFNB3_indiv_plot_save = None
histogram_plot = None
uniq_barcodes_per_lib_df = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
codon_variants_file = "results/variants/codon_variants.csv"
CHO_corr_plot_save = "results/images/CHO_corr_plot_save.html"
entry_binding_corr_heatmap_plot = "results/images/entry_binding_corr_heatmap_plot.html"
CHO_EFNB2_indiv_plot_save = "results/images/CHO_EFNB2_all_corrs.html"
CHO_EFNB3_indiv_plot_save = "results/images/CHO_EFNB3_all_corrs.html"
histogram_plot = "results/images/variants_histogram.html"
uniq_barcodes_per_lib_df = "results/tables/uniq_barcodes_per_lib_df.csv"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
from collections import Counter
import sys
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
# setup working directory
if os.getcwd() == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/":
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
#import altair themes from /data/custom_analyses_data/theme.py and enable
sys.path.append('data/custom_analyses_data/')
import theme
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')
Setup in correct directory
Out[3]:
ThemeRegistry.enable('main_theme')
For running interactively:
In [4]:
if histogram_plot is None:
#altair_config = "data/custom_analyses_data/theme.py"
nipah_config = "nipah_config.yaml"
codon_variants_file = "results/variants/codon_variants.csv"
Read in config files
In [5]:
with open(nipah_config) as f:
config = yaml.safe_load(f)
with open("data/func_effects_config.yml") as y:
config_func = yaml.safe_load(y)
Read in selections¶
In [6]:
cho_efnb2_low_selections = config_func["avg_func_effects"]["CHO_bEFNB2"][
"selections"
]
LibA_CHO_EFNB2 = [
selection + "_func_effects.csv"
for selection in cho_efnb2_low_selections
if "LibA" in selection and "LibB" not in selection
]
LibB_CHO_EFNB2 = [
selection + "_func_effects.csv"
for selection in cho_efnb2_low_selections
if "LibB" in selection and "LibA" not in selection
]
cho_efnb3_low_selections = config_func["avg_func_effects"]["CHO_bEFNB3"][
"selections"
]
LibA_CHO_EFNB3 = [
selection + "_func_effects.csv"
for selection in cho_efnb3_low_selections
if "LibA" in selection and "LibB" not in selection
]
LibB_CHO_EFNB3 = [
selection + "_func_effects.csv"
for selection in cho_efnb3_low_selections
if "LibB" in selection and "LibA" not in selection
]
Calculate correlations for LibA and LibB for CHO-EFNB2 cell entry selections¶
In [7]:
# Define the base directory where CSV files are stored
path = "results/func_effects/by_selection/"
# Function to process functional selections from a specific library
def process_func_selections(library, library_name):
df_list = [] # Initialize a list to store dataframes
clock = 1 # Counter to uniquely name columns for each file
# Loop through each file in the library
for file_name in library:
file_path = os.path.join(path, file_name) # Construct the full file path
# Read the CSV file into a dataframe, then rename and drop specific columns
func_scores = pd.read_csv(file_path)
func_scores_renamed = func_scores.rename(
columns={
"functional_effect": f"functional_effect_{clock}",
"times_seen": f"times_seen_{clock}",
}
)
func_scores_renamed.drop(["latent_phenotype_effect"], axis=1, inplace=True)
df_list.append(func_scores_renamed) # Append modified dataframe to the list
clock += 1 # Increment counter
# Merge all dataframes on 'site', 'mutant', and 'wildtype' columns
merged_df = df_list[0]
for df in df_list[1:]:
merged_df = pd.merge(
merged_df, df, on=["site", "mutant", "wildtype"], how="outer"
)
# Calculate median values of functional effects and times seen across all files
lib_columns_func = [col for col in merged_df.columns if "functional_effect" in col]
merged_df[f"mean_effect_{library_name}"] = merged_df[lib_columns_func].mean(
axis=1
)
lib_columns_times_seen = [col for col in merged_df.columns if "times_seen" in col]
merged_df[f"mean_times_seen_{library_name}"] = merged_df[
lib_columns_times_seen
].mean(axis=1)
# Drop intermediate columns used for calculations
lib_columns = [col for col in merged_df.columns if re.search(r"_\d+", col)]
merged_df = merged_df.drop(columns=lib_columns)
return merged_df
# Process selections for two libraries and two cell types
A_selections_E2 = process_func_selections(LibA_CHO_EFNB2, "LibA")
B_selections_E2 = process_func_selections(LibB_CHO_EFNB2, "LibB")
A_selections_E3 = process_func_selections(LibA_CHO_EFNB3, "LibA")
B_selections_E3 = process_func_selections(LibB_CHO_EFNB3, "LibB")
# Function to merge selections from two libraries
def merge_selections(A_selections, B_selections):
merged_selections = pd.merge(
A_selections, B_selections, on=["wildtype", "site", "mutant"], how="inner"
)
lib_columns_times_seen = [
col for col in merged_selections.columns if "times_seen" in col
]
merged_selections["times_seen"] = merged_selections[lib_columns_times_seen].median(
axis=1
)
return merged_selections
# Merge selections and add cell type information
CHO_EFNB2_merged = merge_selections(A_selections_E2, B_selections_E2)
CHO_EFNB2_merged["cell_type"] = "CHO-bEFNB2"
CHO_EFNB3_merged = merge_selections(A_selections_E3, B_selections_E3)
CHO_EFNB3_merged["cell_type"] = "CHO-bEFNB3"
# Concatenate merged selections for both cell types
both_selections = pd.concat([CHO_EFNB2_merged, CHO_EFNB3_merged])
# Function to generate and display a scatter plot comparing median effects from two libraries
def make_chart_median(df, title):
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=1, bind=slider)
empty = []
variant_selector = alt.selection_point(
on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
)
df = df[(df["mean_effect_LibA"].notna()) & (df["mean_effect_LibB"].notna())].round(2)
size = df.shape[0]
for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
print(selection)
tmp_df = df[df["cell_type"] == selection]
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df[f"mean_effect_LibA"], df[f"mean_effect_LibB"]
)
r_value = float(r_value)
print(f"{r_value:.2f}")
chart = (
alt.Chart(tmp_df, title=f"{selection}")
.mark_point(filled=True,size=20)
.encode(
x=alt.X("mean_effect_LibA", title="Library A"),
y=alt.Y("mean_effect_LibB", title="Library B"),
tooltip=["site", "wildtype", "mutant", "times_seen"],
size=alt.condition(variant_selector, alt.value(100), alt.value(15)),
color=alt.condition(
alt.datum.times_seen < selector,
alt.value("transparent"),
alt.value("black"),
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
)
)
empty.append(chart)
combined_effect_chart = (
alt.hconcat(*empty)
.resolve_scale(y="shared", x="shared", color="independent")
.add_params(variant_selector, selector)
)
return combined_effect_chart
CHO_EFNB2_corr_plot = make_chart_median(both_selections, "CHO-EFNB2")
CHO_EFNB2_corr_plot.display()
if histogram_plot is not None:
CHO_EFNB2_corr_plot.save(CHO_corr_plot_save)
CHO-bEFNB2 0.92 CHO-bEFNB3 0.92
In [8]:
def plot_corr_heatmap(df):
empty_chart = []
for cell in list(df["cell_type"].unique()):
tmp_df = df[df["cell_type"] == cell]
chart = (
alt.Chart(tmp_df, title=f"{cell}")
.mark_rect()
.encode(
x=alt.X("mean_effect_LibA", title="Library A",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(
maxbins=80,
),
y=alt.Y("mean_effect_LibB", title="Library B",axis=alt.Axis(values=[-4,-3,-2,-1,0,1])).bin(
maxbins=80,
),
color=alt.Color("count()", title="Count").scale(scheme="greenblue",type='log'),
tooltip=['count()'],
)
)
empty_chart.append(chart)
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between cell entry measurements between libraries",offset=10)
).resolve_scale(y="shared", x="shared", color="shared")
return combined_chart
entry_binding_corr_heatmap = plot_corr_heatmap(both_selections.query('times_seen >= 2'))
entry_binding_corr_heatmap.display()
entry_binding_corr_heatmap.save(entry_binding_corr_heatmap_plot)
Make correlations between individual selections¶
In [9]:
def process_individ_selections(library):
df_list = []
clock = 1
for file_name in library:
file_path = os.path.join(path, file_name)
print(f"Processing file: {file_path}")
# Read the current CSV file
func_scores = pd.read_csv(file_path)
func_scores_renamed = func_scores.rename(
columns={
"functional_effect": f"functional_effect_{clock}",
"times_seen": f"times_seen_{clock}",
}
)
func_scores_renamed.drop(["latent_phenotype_effect"], axis=1, inplace=True)
# Append the dataframe to the list
df_list.append(func_scores_renamed)
clock = clock + 1
# Merge all dataframes on 'site' and 'mutant'
merged_df = df_list[0]
for df in df_list[1:]:
merged_df = pd.merge(
merged_df, df, on=["site", "mutant", "wildtype"], how="outer"
)
# Make list of how many selections are done for later correlation plots
lib_size = len(library)
number_list = [i for i in range(1, lib_size + 1)]
return merged_df, number_list
CHO_EFNB2_indiv, lib_size_EFNB2 = process_individ_selections(
LibA_CHO_EFNB2 + LibB_CHO_EFNB2
)
CHO_EFNB3_indiv, lib_size_EFNB3 = process_individ_selections(
LibA_CHO_EFNB3 + LibB_CHO_EFNB3
)
def make_chart(df, number_list):
empty_list = []
for i in number_list:
other_empty_list = []
for j in number_list:
df = df[
(df[f"times_seen_{i}"] >= config["func_times_seen_cutoff"])
& (df[f"times_seen_{j}"] >= config["func_times_seen_cutoff"])
& (df[f"functional_effect_{i}"].notna())
& (df[f"functional_effect_{j}"].notna())
]
chart = (
alt.Chart(df)
.mark_circle(size=10, color="black", opacity=0.2)
.encode(
x=alt.X(f"functional_effect_{i}"),
y=alt.Y(f"functional_effect_{j}"),
tooltip=["site", "wildtype", "mutant"],
)
.properties(height=alt.Step(10), width=alt.Step(10))
)
other_empty_list.append(chart)
combined_effect_chart = alt.hconcat(*other_empty_list).resolve_scale(
y="shared", x="shared", color="independent"
)
empty_list.append(combined_effect_chart)
final_combined_chart = alt.vconcat(*empty_list).resolve_scale(
y="shared", x="shared", color="independent"
)
return final_combined_chart
CHO_EFNB2_indiv_plot = make_chart(CHO_EFNB2_indiv, lib_size_EFNB2)
if histogram_plot is not None:
CHO_EFNB2_indiv_plot.save(CHO_EFNB2_indiv_plot_save)
CHO_EFNB3_indiv_plot = make_chart(CHO_EFNB3_indiv, lib_size_EFNB3)
if histogram_plot is not None:
CHO_EFNB3_indiv_plot.save(CHO_EFNB3_indiv_plot_save)
Processing file: results/func_effects/by_selection/LibA-231112-CHO-bEFNB2_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_1_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_2_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_3_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231207-CHO-bEFNB2_5_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231222-CHO-bEFNB2_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231112-CHO-bEFNB2_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231116-CHO-bEFNB2_func_effects.csv Processing file: results/func_effects/by_selection/LibA-230725-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibA-230916-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibA-231024-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230720-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230815-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibB-230818-CHO-bEFNB3_func_effects.csv Processing file: results/func_effects/by_selection/LibB-231116-CHO-bEFNB3_func_effects.csv
Now make histogram of variants¶
In [10]:
codon_variants = pd.read_csv(codon_variants_file)
display(codon_variants.head(3))
unique_barcodes_per_library = codon_variants.groupby("library")["barcode"].nunique()
uniq_barcodes_per_lib = pd.DataFrame(unique_barcodes_per_library)
display(uniq_barcodes_per_lib)
target | library | barcode | variant_call_support | codon_substitutions | aa_substitutions | n_codon_substitutions | n_aa_substitutions | |
---|---|---|---|---|---|---|---|---|
0 | gene | LibA | AAAAAAAAAAAAAGAA | 5 | ACC461ACT ATC475AGC | I475S | 2 | 1 |
1 | gene | LibA | AAAAAAAAAAACCCAT | 36 | GCG16GAG CAG23GAG | A16E Q23E | 2 | 2 |
2 | gene | LibA | AAAAAAAAAAAGTTTC | 6 | TAC319CCC | Y319P | 1 | 1 |
barcode | |
---|---|
library | |
LibA | 78450 |
LibB | 60623 |
Find which sites are present, and which are missing¶
In [11]:
# Initialize an empty dictionary to keep track of observed mutations
aa_counts = {}
wildtypes = {} # Dictionary to keep track of wildtype amino acids for each site
# Function to process each cell, update counts, and record wildtype amino acids
def process_cell(cell):
if pd.notna(cell): # Check if cell is not NaN
substitutions = cell.split()
for substitution in substitutions:
if substitution[-1] not in ("*", "-") and substitution[0] not in (
"*"
): # Skip if substitution ends with '*' or '-'
site = substitution[1:-1]
mutation = substitution[-1]
wildtype = substitution[0]
site_mutation = site + mutation
if site not in wildtypes:
wildtypes[site] = wildtype
if site_mutation in aa_counts:
aa_counts[site_mutation] += 1
else:
aa_counts[site_mutation] = 1
empty_mutants = []
empty_percent = []
for lib in ["LibA", "LibB"]:
# Apply the function to each cell in the 'aa_substitutions' column
tmp_df = codon_variants[codon_variants["library"] == lib]
tmp_df["aa_substitutions"].apply(process_cell)
# Generate all possible combinations excluding the wildtype for each site
expected_sites = range(1, 533)
possible_mutations = [
"A",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"K",
"L",
"M",
"N",
"P",
"Q",
"R",
"S",
"T",
"V",
"W",
"Y",
]
# Adjust expected combinations to exclude the wildtype for each site
expected_combinations = set()
for site in expected_sites:
site_str = str(site)
if site_str in wildtypes:
wildtype = wildtypes[site_str]
for mutation in possible_mutations:
if mutation != wildtype: # Exclude the wildtype amino acid
expected_combinations.add(site_str + mutation)
# Extract the actual combinations from the counts
actual_combinations = set(aa_counts.keys())
# Find missing combinations
missing_combinations = expected_combinations - actual_combinations
# Display results
print(f"Number of unique site-mutation combinations observed: {len(aa_counts)}")
print(
f"Number of missing combinations (excluding wildtypes): {len(missing_combinations)}"
)
print(
f"Total possible combinations excluding wildtypes: {len(expected_combinations)}"
)
empty_percent.append(len(actual_combinations) / len(expected_combinations))
uniq_barcodes_per_lib["percent"] = empty_percent
uniq_barcodes_per_lib["percent"] = uniq_barcodes_per_lib["percent"] * 100
uniq_barcodes_per_lib = uniq_barcodes_per_lib.round(2)
uniq_barcodes_per_lib = uniq_barcodes_per_lib.reset_index()
uniq_barcodes_per_lib.to_csv(uniq_barcodes_per_lib_df, index=False)
Number of unique site-mutation combinations observed: 10055 Number of missing combinations (excluding wildtypes): 54 Total possible combinations excluding wildtypes: 10108 Number of unique site-mutation combinations observed: 10080 Number of missing combinations (excluding wildtypes): 29 Total possible combinations excluding wildtypes: 10108
In [12]:
def calculate_fraction(library):
total_A = codon_variants[codon_variants["library"] == library].shape[0]
for number in range(5):
fraction = codon_variants[
(codon_variants["library"] == library)
& (codon_variants["n_aa_substitutions"] == number)
].shape[0]
print(
f"For {library}, the fraction of sites with {number} mutations relative to WT are: {fraction/total_A:.2f}"
)
calculate_fraction("LibA")
calculate_fraction("LibB")
For LibA, the fraction of sites with 0 mutations relative to WT are: 0.11 For LibA, the fraction of sites with 1 mutations relative to WT are: 0.64 For LibA, the fraction of sites with 2 mutations relative to WT are: 0.22 For LibA, the fraction of sites with 3 mutations relative to WT are: 0.03 For LibA, the fraction of sites with 4 mutations relative to WT are: 0.00 For LibB, the fraction of sites with 0 mutations relative to WT are: 0.11 For LibB, the fraction of sites with 1 mutations relative to WT are: 0.65 For LibB, the fraction of sites with 2 mutations relative to WT are: 0.21 For LibB, the fraction of sites with 3 mutations relative to WT are: 0.03 For LibB, the fraction of sites with 4 mutations relative to WT are: 0.00
In [13]:
def plot_histogram(df):
df = df.drop(
columns=[
"barcode",
"target",
"variant_call_support",
"codon_substitutions",
"aa_substitutions",
"n_codon_substitutions",
]
)
chart = (
alt.Chart(df)
.mark_bar(color="black")
.encode(
alt.X("n_aa_substitutions:O", title="# of AA Mutations",axis=alt.Axis(labelAngle=0)).scale(domain=[0,1,2,3,4,5,6,7]),
alt.Y(
"count()", title="Count", axis=alt.Axis(grid=False)
),
column=alt.Column(
"library", header=alt.Header(title=None, labelFontSize=20,labelFont='Helvetica Light',labelFontStyle='bold')
),
)
)
return chart
histogram = plot_histogram(codon_variants)
histogram.display()
if histogram_plot is not None:
histogram.save(histogram_plot)
In [ ]: