Source code for af_analysis.docking

import numpy as np
import pdb_cpp

from tqdm.auto import tqdm
from . import data, analysis


# Autorship information
__author__ = "Alaa Reguei, Samuel Murail"
__copyright__ = "Copyright 2023, RPBS"
__credits__ = ["Samuel Murail", "Alaa Reguei"]
__license__ = "GNU General Public License version 2"
__version__ = "0.2.0"
__maintainer__ = "Samuel Murail"
__email__ = "samuel.murail@u-paris.fr"
__status__ = "Beta"

"""
The module contains functions to extract and compute docking scores.

.. warning:
    The ligand chain is assumed to be the last chain in the list of chains.
"""


[docs] def pae_pep(my_data, fun=np.mean): """Extract the PAE score for the receptor(s)-peptide interface. Parameters ---------- my_data : AF2Data object containing the data fun : function function to apply to the PAE scores Returns ------- None The `log_pd` dataframe is modified in place. """ pep_rec_pae_list = [] rec_pep_pae_list = [] disable = not getattr(my_data, "verbose", True) for i, (query, data_file) in tqdm( enumerate(zip(my_data.df["query"], my_data.df["data_file"])), total=len(my_data.df), disable=disable, ): chain_length = my_data.chain_length[query] cum_sum_chain = np.cumsum([0] + chain_length) pae = data.get_pae(data_file) # print(f"0:{cum_sum_chain[-2]} , {cum_sum_chain[-2]}:{cum_sum_chain[-1]}") # print(pae.shape) if pae is None: pep_rec_pae_list.append(None) rec_pep_pae_list.append(None) continue rec_pep_pae = fun( pae[0 : cum_sum_chain[-2], cum_sum_chain[-2] : cum_sum_chain[-1]] ) pep_rec_pae = fun( pae[cum_sum_chain[-2] : cum_sum_chain[-1], 0 : cum_sum_chain[-2]] ) pep_rec_pae_list.append(pep_rec_pae) rec_pep_pae_list.append(rec_pep_pae) my_data.df.loc[:, "PAE_pep_rec"] = pep_rec_pae_list my_data.df.loc[:, "PAE_rec_pep"] = rec_pep_pae_list
[docs] def pae_contact_pep(my_data, fun=np.mean, cutoff=8.0, max_pae=30.98): """Extract the PAE score for the receptor(s)-peptide interface. Parameters ---------- my_data : AF2Data object containing the data fun : function function to apply to the PAE scores Returns ------- None The `log_pd` dataframe is modified in place. """ pep_rec_pae_list = [] rec_pep_pae_list = [] disable = not getattr(my_data, "verbose", True) for i, (query, data_file, pdb) in tqdm( enumerate(zip(my_data.df["query"], my_data.df["data_file"], my_data.df["pdb"])), total=len(my_data.df), disable=disable, ): chains = my_data.chains[query] if pdb is None or data_file is None: pep_rec_pae_list.append(None) rec_pep_pae_list.append(None) continue pae = data.get_pae(data_file) model = pdb_cpp.Coor(pdb) model_CA = model.select_atoms("name CA") contact_lig = model_CA.select_atoms( f"chain {chains[-1]} and within {cutoff} of chain {' '.join(chains[:-1])}" ) contact_rec = model_CA.select_atoms( f"chain {' '.join(chains[:-1])} and within {cutoff} of chain {chains[-1]}" ) if contact_lig.len == 0 or contact_rec.len == 0: pep_rec_pae_list.append(max_pae) rec_pep_pae_list.append(max_pae) continue rec_mask = np.zeros(pae.shape) lig_mask = np.zeros(pae.shape) rec_mask[contact_rec.residue, :] = 1 lig_mask[:, contact_lig.residue] = 1 pair_mask = np.logical_and(rec_mask, lig_mask) rec_pep_pae = fun(pae[pair_mask]) pep_rec_pae = fun(pae[pair_mask.T]) pep_rec_pae_list.append(pep_rec_pae) rec_pep_pae_list.append(rec_pep_pae) my_data.df.loc[:, "PAE_contact_pep_rec"] = pep_rec_pae_list my_data.df.loc[:, "PAE_contact_rec_pep"] = rec_pep_pae_list
[docs] def plddt_pep(my_data, fun=np.mean): """Extract the pLDDT score for the peptide-peptide interface. Parameters ---------- my_data : AF2Data object containing the data fun : function function to apply to the pLDDT scores Returns ------- None The `log_pd` dataframe is modified in place. """ pep_plddt_list = [] disable = not getattr(my_data, "verbose", True) for i, (query, pdb) in tqdm( enumerate(zip(my_data.df["query"], my_data.df["pdb"])), total=len(my_data.df), disable=disable, ): chain_length = my_data.chain_length[query] cum_sum_chain = np.cumsum([0] + chain_length) plddt = my_data.get_plddt(i) if plddt is None: pep_plddt_list.append(None) continue pep_plddt_list.append(fun(plddt[cum_sum_chain[-2] : cum_sum_chain[-1]])) my_data.df.loc[:, "plddt_pep"] = pep_plddt_list
[docs] def plddt_contact_pep(my_data, fun=np.mean, cutoff=8.0): """Extract the pLDDT score for the peptide-peptide interface. Parameters ---------- my_data : AF2Data object containing the data fun : function function to apply to the pLDDT scores Returns ------- None The `log_pd` dataframe is modified in place. """ lig_plddt_list = [] rec_plddt_list = [] disable = not getattr(my_data, "verbose", True) for i, (query, pdb) in tqdm( enumerate(zip(my_data.df["query"], my_data.df["pdb"])), total=len(my_data.df), disable=disable, ): chains = my_data.chains[query] if len(chains) < 2: lig_plddt_list.append(None) rec_plddt_list.append(None) continue if pdb is None: lig_plddt_list.append(None) rec_plddt_list.append(None) continue model = pdb_cpp.Coor(pdb) model_CA = model.select_atoms("name CA") contact_lig = model_CA.select_atoms( f"chain {chains[-1]} and within {cutoff} of chain {' '.join(chains[:-1])}" ) contact_rec = model_CA.select_atoms( f"chain {' '.join(chains[:-1])} and within {cutoff} of chain {chains[-1]}" ) if contact_lig.len > 0: lig_plddt_list.append(fun(contact_lig.beta)) else: lig_plddt_list.append(0) if contact_rec.len > 0: rec_plddt_list.append(fun(contact_rec.beta)) else: rec_plddt_list.append(0) my_data.df.loc[:, "plddt_contact_lig"] = lig_plddt_list my_data.df.loc[:, "plddt_contact_rec"] = rec_plddt_list
[docs] def LIS_pep(my_data, pae_cutoff=12.0, fun=np.max): """Compute the LIS score for the peptide-peptide interface. Parameters ---------- my_data : AF2Data object containing the data pae_cutoff : float cutoff for native contacts, default is 12.0 A fun : function function to apply to the LIS matrix Returns ------- None The `log_pd` dataframe is modified in place. """ analysis.LIS_matrix(my_data, pae_cutoff=pae_cutoff) pep_LIS_list = [] pep_LIS2_list = [] for query, LIS in zip(my_data.df["query"], my_data.df["LIS"]): if LIS is None: pep_LIS_list.append(None) pep_LIS2_list.append(None) continue chain_num = len(my_data.chains[query]) if chain_num < 2: pep_LIS_list.append(None) pep_LIS2_list.append(None) continue LIS_array = np.array(LIS) pep_LIS_list.append(fun(LIS_array[0 : chain_num - 1, chain_num - 1])) pep_LIS2_list.append(fun(LIS_array[chain_num - 1, 0 : chain_num - 1])) my_data.df.loc[:, "LIS_rec_pep"] = pep_LIS2_list my_data.df.loc[:, "LIS_pep_rec"] = pep_LIS_list
[docs] def cLIS_lig(my_data, pae_cutoff=12.0, dict_cutoff=8.0, fun=np.max): """Compute the cLIS score for the peptide-peptide interface. Parameters ---------- my_data : AF2Data object containing the data pae_cutoff : float cutoff for native contacts, default is 12.0 A dist_cutoff : float cutoff for distance contacts, default is 8.0 A fun : function function to apply to the LIS matrix Returns ------- None The `log_pd` dataframe is modified in place. """ analysis.cLIS_matrix(my_data, pae_cutoff=pae_cutoff, dist_cutoff=dict_cutoff) pep_LIA_list = [] pep_LIA2_list = [] for query, LIA in zip(my_data.df["query"], my_data.df["cLIS"]): if LIA is None: pep_LIA_list.append(None) pep_LIA2_list.append(None) continue chain_num = len(my_data.chains[query]) if chain_num < 2: pep_LIA_list.append(None) pep_LIA2_list.append(None) continue LIA_array = np.array(LIA) pep_LIA_list.append(fun(LIA_array[0 : chain_num - 1, chain_num - 1])) pep_LIA2_list.append(fun(LIA_array[chain_num - 1, 0 : chain_num - 1])) my_data.df.loc[:, "cLIS_rec_lig"] = pep_LIA2_list my_data.df.loc[:, "cLIS_lig_rec"] = pep_LIA_list
[docs] def iLIS_lig(my_data, pae_cutoff=12.0, dict_cutoff=8.0, fun=np.max): """Compute the cLIS score for the peptide-peptide interface. Parameters ---------- my_data : AF2Data object containing the data pae_cutoff : float cutoff for native contacts, default is 12.0 A dist_cutoff : float cutoff for distance contacts, default is 8.0 A fun : function function to apply to the LIS matrix Returns ------- None The `log_pd` dataframe is modified in place. """ LIS_pep(my_data, pae_cutoff=pae_cutoff, fun=fun) cLIS_lig( my_data, pae_cutoff=pae_cutoff, dict_cutoff=dict_cutoff, fun=fun, ) my_data.df.loc[:, "iLIS_rec_lig"] = ( my_data.df.loc[:, "LIS_rec_pep"] * my_data.df.loc[:, "cLIS_rec_lig"] ) ** 0.5 my_data.df.loc[:, "iLIS_lig_rec"] = ( my_data.df.loc[:, "LIS_pep_rec"] * my_data.df.loc[:, "cLIS_lig_rec"] ) ** 0.5
[docs] def pdockq2_lig(my_data): """Compute the LIS score for the receptor-ligand interface. Parameters ---------- my_data : AF2Data object containing the data pae_cutoff : float cutoff for native contacts, default is 8.0 A fun : function function to apply to the LIS matrix Returns ------- None The `log_pd` dataframe is modified in place. """ analysis.pdockq2(my_data) old_query = "" pdockq2_list = [] for index, row in my_data.df.iterrows(): if row["query"] != old_query: old_query = row["query"] chains = my_data.chains[old_query] lig_chain = chains[-1] rec_chains = chains[:-1] pdockq2_list.append(row[f"pdockq2_{lig_chain}"]) my_data.df.loc[:, "pdockq2_lig"] = pdockq2_list
[docs] def ipTM_d0_lig(my_data, weight_avg=False): """Compute the ipTM_d0 score for the receptor-ligand interface. Parameters ---------- my_data : AF2Data object containing the data weight_avg : bool whether to weight the ipTM_d0 by the receptor chain lengths Returns ------- None The `my_data.df` dataframe is modified in place. """ analysis.ipTM_d0(my_data) old_query = "" ipTM_list = [] ipTM_rec_lig_list = [] ipTM_lig_rec_list = [] for index, row in my_data.df.iterrows(): if row["query"] != old_query: old_query = row["query"] chains = my_data.chains[old_query] chain_length = my_data.chain_length[old_query] # Get the shortest chain as ligand chain lig_index = np.argmin(chain_length) lig_chain = chains[lig_index] rec_chains = chains[:lig_index] + chains[lig_index + 1 :] rec_chain_indexes = list(range(lig_index)) + list( range(lig_index + 1, len(chains)) ) local_ipTM_list = [] local_ipTM_rec_lig_list = [] local_ipTM_lig_rec_list = [] if weight_avg: sum_chain_length = sum(my_data.chain_length[old_query]) else: sum_chain_length = len(rec_chains) for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes): # Get the ipTM_d0 for the ligand chain if weight_avg: local_ipTM_list.append( row[f"ipTM_d0_{rec_chain}_{lig_chain}"] * my_data.chain_length[old_query][rec_index] ) local_ipTM_list.append( row[f"ipTM_d0_{lig_chain}_{rec_chain}"] * my_data.chain_length[old_query][rec_index] ) local_ipTM_rec_lig_list.append( row[f"ipTM_d0_{rec_chain}_{lig_chain}"] * my_data.chain_length[old_query][rec_index] ) local_ipTM_lig_rec_list.append( row[f"ipTM_d0_{lig_chain}_{rec_chain}"] * my_data.chain_length[old_query][rec_index] ) else: local_ipTM_list.append(row[f"ipTM_d0_{rec_chain}_{lig_chain}"]) local_ipTM_list.append(row[f"ipTM_d0_{lig_chain}_{rec_chain}"]) local_ipTM_rec_lig_list.append(row[f"ipTM_d0_{rec_chain}_{lig_chain}"]) local_ipTM_lig_rec_list.append(row[f"ipTM_d0_{lig_chain}_{rec_chain}"]) if sum_chain_length == 0: ipTM_list.append(0) ipTM_lig_rec_list.append(0) ipTM_rec_lig_list.append(0) else: ipTM_list.append(sum(local_ipTM_list) / (2 * sum_chain_length)) ipTM_rec_lig_list.append(sum(local_ipTM_rec_lig_list) / sum_chain_length) ipTM_lig_rec_list.append(sum(local_ipTM_lig_rec_list) / sum_chain_length) my_data.df.loc[:, "ipTM_d0_lig"] = ipTM_list my_data.df.loc[:, "ipTM_d0_rec_lig"] = ipTM_rec_lig_list my_data.df.loc[:, "ipTM_d0_lig_rec"] = ipTM_lig_rec_list
[docs] def ipSAE_lig(my_data, weight_avg=False): """Compute the ipSAE score for the receptor-ligand interface. Parameters ---------- my_data : AF2Data object containing the data Returns ------- None The `my_data.df` dataframe is modified in place. """ analysis.ipSAE(my_data) old_query = "" ipTM_list = [] for index, row in my_data.df.iterrows(): if row["query"] != old_query: old_query = row["query"] chains = my_data.chains[old_query] chain_length = my_data.chain_length[old_query] # Get the shortest chain as ligand chain lig_index = np.argmin(chain_length) lig_chain = chains[lig_index] rec_chains = chains[:lig_index] + chains[lig_index + 1 :] rec_chain_indexes = list(range(lig_index)) + list( range(lig_index + 1, len(chains)) ) local_ipTM_list = [] if weight_avg: sum_chain_length = sum(my_data.chain_length[old_query]) else: sum_chain_length = len(rec_chains) for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes): # Get the ipTM_d0 for the ligand chain if weight_avg: local_ipTM_list.append( row[f"ipSAE_{rec_chain}_{lig_chain}"] * my_data.chain_length[old_query][rec_index] ) else: local_ipTM_list.append(row[f"ipSAE_{rec_chain}_{lig_chain}"]) if sum_chain_length == 0: ipTM_list.append(0) else: ipTM_list.append(sum(local_ipTM_list) / sum_chain_length) my_data.df.loc[:, "ipSAE_lig"] = ipTM_list
[docs] def ipTM_d0_interface_lig(my_data, weight_avg=False): """Compute the ipTM_d0 score for the receptor-ligand interface. Parameters ---------- my_data : AF2Data object containing the data weight_avg : bool whether to weight the ipTM_d0 by the receptor chain lengths Returns ------- None The `my_data.df` dataframe is modified in place. """ analysis.ipTM_d0_interface(my_data) old_query = "" ipTM_list = [] for index, row in my_data.df.iterrows(): if row["query"] != old_query: old_query = row["query"] chains = my_data.chains[old_query] chain_length = my_data.chain_length[old_query] # Get the shortest chain as ligand chain lig_index = np.argmin(chain_length) lig_chain = chains[lig_index] rec_chains = chains[:lig_index] + chains[lig_index + 1 :] rec_chain_indexes = list(range(lig_index)) + list( range(lig_index + 1, len(chains)) ) local_ipTM_list = [] if weight_avg: sum_chain_length = sum(my_data.chain_length[old_query]) else: sum_chain_length = len(rec_chains) for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes): # Get the ipTM_d0 for the ligand chain if weight_avg: local_ipTM_list.append( row[f"ipTM_interface_{rec_chain}_{lig_chain}"] * my_data.chain_length[old_query][rec_index] ) else: local_ipTM_list.append(row[f"ipTM_interface_{rec_chain}_{lig_chain}"]) if sum_chain_length == 0: ipTM_list.append(0) else: ipTM_list.append(sum(local_ipTM_list) / sum_chain_length) my_data.df.loc[:, "ipTM_interface_lig"] = ipTM_list
[docs] def ipTM_between_chains(my_data, chain_groups): r""" Extract ipTM from pair_chain_iptm's array between user-specified chain groups. data : AF2Data object containing the data chains: list list of length 2 for the chain groups in the form of concatenated chain ids, between which the ipTM is extracted """ id_group1 = list(chain_groups[0]) id_group2 = list(chain_groups[1]) colname = ",".join(id_group1) + "-" + ",".join(id_group2) + "_ipTM" try: my_data.df["chain_pair_iptm"] except KeyError: print("No 'chain_pair_iptm' key in the json data") my_data.df[colname] = -1 # get chain index in chain_pair_iptm array from ids # /!\ this code assumes that the model's first chain id is A group1 = [ord(chain_id) - ord("A") for chain_id in id_group1] group2 = [ord(chain_id) - ord("A") for chain_id in id_group2] chain_group_iptm = [] disable = not getattr(my_data, "verbose", True) for iptm_array in tqdm( my_data.df["chain_pair_iptm"], total=len(my_data.df["pdb"]), disable=disable ): if isinstance(iptm_array, float) and np.isnan(iptm_array): chain_group_iptm.append(np.nan) continue to_avg = [] for chain1 in group1: for chain2 in group2: pair_iptm = iptm_array[chain1][chain2] to_avg.append(pair_iptm) chain_group_iptm.append(np.mean(to_avg)) # register the ipTM of the chain interface my_data.df[colname] = chain_group_iptm