import numpy as np
import pdb_cpp
from tqdm.auto import tqdm
from . import data, analysis
# Autorship information
__author__ = "Alaa Reguei, Samuel Murail"
__copyright__ = "Copyright 2023, RPBS"
__credits__ = ["Samuel Murail", "Alaa Reguei"]
__license__ = "GNU General Public License version 2"
__version__ = "0.2.0"
__maintainer__ = "Samuel Murail"
__email__ = "samuel.murail@u-paris.fr"
__status__ = "Beta"
"""
The module contains functions to extract and compute docking scores.
.. warning:
The ligand chain is assumed to be the last chain in the list of chains.
"""
[docs]
def pae_pep(my_data, fun=np.mean):
"""Extract the PAE score for the receptor(s)-peptide interface.
Parameters
----------
my_data : AF2Data
object containing the data
fun : function
function to apply to the PAE scores
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
pep_rec_pae_list = []
rec_pep_pae_list = []
disable = not getattr(my_data, "verbose", True)
for i, (query, data_file) in tqdm(
enumerate(zip(my_data.df["query"], my_data.df["data_file"])),
total=len(my_data.df),
disable=disable,
):
chain_length = my_data.chain_length[query]
cum_sum_chain = np.cumsum([0] + chain_length)
pae = data.get_pae(data_file)
# print(f"0:{cum_sum_chain[-2]} , {cum_sum_chain[-2]}:{cum_sum_chain[-1]}")
# print(pae.shape)
if pae is None:
pep_rec_pae_list.append(None)
rec_pep_pae_list.append(None)
continue
rec_pep_pae = fun(
pae[0 : cum_sum_chain[-2], cum_sum_chain[-2] : cum_sum_chain[-1]]
)
pep_rec_pae = fun(
pae[cum_sum_chain[-2] : cum_sum_chain[-1], 0 : cum_sum_chain[-2]]
)
pep_rec_pae_list.append(pep_rec_pae)
rec_pep_pae_list.append(rec_pep_pae)
my_data.df.loc[:, "PAE_pep_rec"] = pep_rec_pae_list
my_data.df.loc[:, "PAE_rec_pep"] = rec_pep_pae_list
[docs]
def plddt_pep(my_data, fun=np.mean):
"""Extract the pLDDT score for the peptide-peptide interface.
Parameters
----------
my_data : AF2Data
object containing the data
fun : function
function to apply to the pLDDT scores
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
pep_plddt_list = []
disable = not getattr(my_data, "verbose", True)
for i, (query, pdb) in tqdm(
enumerate(zip(my_data.df["query"], my_data.df["pdb"])),
total=len(my_data.df),
disable=disable,
):
chain_length = my_data.chain_length[query]
cum_sum_chain = np.cumsum([0] + chain_length)
plddt = my_data.get_plddt(i)
if plddt is None:
pep_plddt_list.append(None)
continue
pep_plddt_list.append(fun(plddt[cum_sum_chain[-2] : cum_sum_chain[-1]]))
my_data.df.loc[:, "plddt_pep"] = pep_plddt_list
[docs]
def LIS_pep(my_data, pae_cutoff=12.0, fun=np.max):
"""Compute the LIS score for the peptide-peptide interface.
Parameters
----------
my_data : AF2Data
object containing the data
pae_cutoff : float
cutoff for native contacts, default is 12.0 A
fun : function
function to apply to the LIS matrix
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
analysis.LIS_matrix(my_data, pae_cutoff=pae_cutoff)
pep_LIS_list = []
pep_LIS2_list = []
for query, LIS in zip(my_data.df["query"], my_data.df["LIS"]):
if LIS is None:
pep_LIS_list.append(None)
pep_LIS2_list.append(None)
continue
chain_num = len(my_data.chains[query])
if chain_num < 2:
pep_LIS_list.append(None)
pep_LIS2_list.append(None)
continue
LIS_array = np.array(LIS)
pep_LIS_list.append(fun(LIS_array[0 : chain_num - 1, chain_num - 1]))
pep_LIS2_list.append(fun(LIS_array[chain_num - 1, 0 : chain_num - 1]))
my_data.df.loc[:, "LIS_rec_pep"] = pep_LIS2_list
my_data.df.loc[:, "LIS_pep_rec"] = pep_LIS_list
[docs]
def cLIS_lig(my_data, pae_cutoff=12.0, dict_cutoff=8.0, fun=np.max):
"""Compute the cLIS score for the peptide-peptide interface.
Parameters
----------
my_data : AF2Data
object containing the data
pae_cutoff : float
cutoff for native contacts, default is 12.0 A
dist_cutoff : float
cutoff for distance contacts, default is 8.0 A
fun : function
function to apply to the LIS matrix
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
analysis.cLIS_matrix(my_data, pae_cutoff=pae_cutoff, dist_cutoff=dict_cutoff)
pep_LIA_list = []
pep_LIA2_list = []
for query, LIA in zip(my_data.df["query"], my_data.df["cLIS"]):
if LIA is None:
pep_LIA_list.append(None)
pep_LIA2_list.append(None)
continue
chain_num = len(my_data.chains[query])
if chain_num < 2:
pep_LIA_list.append(None)
pep_LIA2_list.append(None)
continue
LIA_array = np.array(LIA)
pep_LIA_list.append(fun(LIA_array[0 : chain_num - 1, chain_num - 1]))
pep_LIA2_list.append(fun(LIA_array[chain_num - 1, 0 : chain_num - 1]))
my_data.df.loc[:, "cLIS_rec_lig"] = pep_LIA2_list
my_data.df.loc[:, "cLIS_lig_rec"] = pep_LIA_list
[docs]
def iLIS_lig(my_data, pae_cutoff=12.0, dict_cutoff=8.0, fun=np.max):
"""Compute the cLIS score for the peptide-peptide interface.
Parameters
----------
my_data : AF2Data
object containing the data
pae_cutoff : float
cutoff for native contacts, default is 12.0 A
dist_cutoff : float
cutoff for distance contacts, default is 8.0 A
fun : function
function to apply to the LIS matrix
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
LIS_pep(my_data, pae_cutoff=pae_cutoff, fun=fun)
cLIS_lig(
my_data,
pae_cutoff=pae_cutoff,
dict_cutoff=dict_cutoff,
fun=fun,
)
my_data.df.loc[:, "iLIS_rec_lig"] = (
my_data.df.loc[:, "LIS_rec_pep"] * my_data.df.loc[:, "cLIS_rec_lig"]
) ** 0.5
my_data.df.loc[:, "iLIS_lig_rec"] = (
my_data.df.loc[:, "LIS_pep_rec"] * my_data.df.loc[:, "cLIS_lig_rec"]
) ** 0.5
[docs]
def pdockq2_lig(my_data):
"""Compute the LIS score for the receptor-ligand interface.
Parameters
----------
my_data : AF2Data
object containing the data
pae_cutoff : float
cutoff for native contacts, default is 8.0 A
fun : function
function to apply to the LIS matrix
Returns
-------
None
The `log_pd` dataframe is modified in place.
"""
analysis.pdockq2(my_data)
old_query = ""
pdockq2_list = []
for index, row in my_data.df.iterrows():
if row["query"] != old_query:
old_query = row["query"]
chains = my_data.chains[old_query]
lig_chain = chains[-1]
rec_chains = chains[:-1]
pdockq2_list.append(row[f"pdockq2_{lig_chain}"])
my_data.df.loc[:, "pdockq2_lig"] = pdockq2_list
[docs]
def ipTM_d0_lig(my_data, weight_avg=False):
"""Compute the ipTM_d0 score for the receptor-ligand interface.
Parameters
----------
my_data : AF2Data
object containing the data
weight_avg : bool
whether to weight the ipTM_d0 by the receptor chain lengths
Returns
-------
None
The `my_data.df` dataframe is modified in place.
"""
analysis.ipTM_d0(my_data)
old_query = ""
ipTM_list = []
ipTM_rec_lig_list = []
ipTM_lig_rec_list = []
for index, row in my_data.df.iterrows():
if row["query"] != old_query:
old_query = row["query"]
chains = my_data.chains[old_query]
chain_length = my_data.chain_length[old_query]
# Get the shortest chain as ligand chain
lig_index = np.argmin(chain_length)
lig_chain = chains[lig_index]
rec_chains = chains[:lig_index] + chains[lig_index + 1 :]
rec_chain_indexes = list(range(lig_index)) + list(
range(lig_index + 1, len(chains))
)
local_ipTM_list = []
local_ipTM_rec_lig_list = []
local_ipTM_lig_rec_list = []
if weight_avg:
sum_chain_length = sum(my_data.chain_length[old_query])
else:
sum_chain_length = len(rec_chains)
for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes):
# Get the ipTM_d0 for the ligand chain
if weight_avg:
local_ipTM_list.append(
row[f"ipTM_d0_{rec_chain}_{lig_chain}"]
* my_data.chain_length[old_query][rec_index]
)
local_ipTM_list.append(
row[f"ipTM_d0_{lig_chain}_{rec_chain}"]
* my_data.chain_length[old_query][rec_index]
)
local_ipTM_rec_lig_list.append(
row[f"ipTM_d0_{rec_chain}_{lig_chain}"]
* my_data.chain_length[old_query][rec_index]
)
local_ipTM_lig_rec_list.append(
row[f"ipTM_d0_{lig_chain}_{rec_chain}"]
* my_data.chain_length[old_query][rec_index]
)
else:
local_ipTM_list.append(row[f"ipTM_d0_{rec_chain}_{lig_chain}"])
local_ipTM_list.append(row[f"ipTM_d0_{lig_chain}_{rec_chain}"])
local_ipTM_rec_lig_list.append(row[f"ipTM_d0_{rec_chain}_{lig_chain}"])
local_ipTM_lig_rec_list.append(row[f"ipTM_d0_{lig_chain}_{rec_chain}"])
if sum_chain_length == 0:
ipTM_list.append(0)
ipTM_lig_rec_list.append(0)
ipTM_rec_lig_list.append(0)
else:
ipTM_list.append(sum(local_ipTM_list) / (2 * sum_chain_length))
ipTM_rec_lig_list.append(sum(local_ipTM_rec_lig_list) / sum_chain_length)
ipTM_lig_rec_list.append(sum(local_ipTM_lig_rec_list) / sum_chain_length)
my_data.df.loc[:, "ipTM_d0_lig"] = ipTM_list
my_data.df.loc[:, "ipTM_d0_rec_lig"] = ipTM_rec_lig_list
my_data.df.loc[:, "ipTM_d0_lig_rec"] = ipTM_lig_rec_list
[docs]
def ipSAE_lig(my_data, weight_avg=False):
"""Compute the ipSAE score for the receptor-ligand interface.
Parameters
----------
my_data : AF2Data
object containing the data
Returns
-------
None
The `my_data.df` dataframe is modified in place.
"""
analysis.ipSAE(my_data)
old_query = ""
ipTM_list = []
for index, row in my_data.df.iterrows():
if row["query"] != old_query:
old_query = row["query"]
chains = my_data.chains[old_query]
chain_length = my_data.chain_length[old_query]
# Get the shortest chain as ligand chain
lig_index = np.argmin(chain_length)
lig_chain = chains[lig_index]
rec_chains = chains[:lig_index] + chains[lig_index + 1 :]
rec_chain_indexes = list(range(lig_index)) + list(
range(lig_index + 1, len(chains))
)
local_ipTM_list = []
if weight_avg:
sum_chain_length = sum(my_data.chain_length[old_query])
else:
sum_chain_length = len(rec_chains)
for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes):
# Get the ipTM_d0 for the ligand chain
if weight_avg:
local_ipTM_list.append(
row[f"ipSAE_{rec_chain}_{lig_chain}"]
* my_data.chain_length[old_query][rec_index]
)
else:
local_ipTM_list.append(row[f"ipSAE_{rec_chain}_{lig_chain}"])
if sum_chain_length == 0:
ipTM_list.append(0)
else:
ipTM_list.append(sum(local_ipTM_list) / sum_chain_length)
my_data.df.loc[:, "ipSAE_lig"] = ipTM_list
[docs]
def ipTM_d0_interface_lig(my_data, weight_avg=False):
"""Compute the ipTM_d0 score for the receptor-ligand interface.
Parameters
----------
my_data : AF2Data
object containing the data
weight_avg : bool
whether to weight the ipTM_d0 by the receptor chain lengths
Returns
-------
None
The `my_data.df` dataframe is modified in place.
"""
analysis.ipTM_d0_interface(my_data)
old_query = ""
ipTM_list = []
for index, row in my_data.df.iterrows():
if row["query"] != old_query:
old_query = row["query"]
chains = my_data.chains[old_query]
chain_length = my_data.chain_length[old_query]
# Get the shortest chain as ligand chain
lig_index = np.argmin(chain_length)
lig_chain = chains[lig_index]
rec_chains = chains[:lig_index] + chains[lig_index + 1 :]
rec_chain_indexes = list(range(lig_index)) + list(
range(lig_index + 1, len(chains))
)
local_ipTM_list = []
if weight_avg:
sum_chain_length = sum(my_data.chain_length[old_query])
else:
sum_chain_length = len(rec_chains)
for rec_chain, rec_index in zip(rec_chains, rec_chain_indexes):
# Get the ipTM_d0 for the ligand chain
if weight_avg:
local_ipTM_list.append(
row[f"ipTM_interface_{rec_chain}_{lig_chain}"]
* my_data.chain_length[old_query][rec_index]
)
else:
local_ipTM_list.append(row[f"ipTM_interface_{rec_chain}_{lig_chain}"])
if sum_chain_length == 0:
ipTM_list.append(0)
else:
ipTM_list.append(sum(local_ipTM_list) / sum_chain_length)
my_data.df.loc[:, "ipTM_interface_lig"] = ipTM_list
[docs]
def ipTM_between_chains(my_data, chain_groups):
r"""
Extract ipTM from pair_chain_iptm's array between user-specified chain groups.
data : AF2Data
object containing the data
chains: list
list of length 2 for the chain groups in the form of concatenated
chain ids, between which the ipTM is extracted
"""
id_group1 = list(chain_groups[0])
id_group2 = list(chain_groups[1])
colname = ",".join(id_group1) + "-" + ",".join(id_group2) + "_ipTM"
try:
my_data.df["chain_pair_iptm"]
except KeyError:
print("No 'chain_pair_iptm' key in the json data")
my_data.df[colname] = -1
# get chain index in chain_pair_iptm array from ids
# /!\ this code assumes that the model's first chain id is A
group1 = [ord(chain_id) - ord("A") for chain_id in id_group1]
group2 = [ord(chain_id) - ord("A") for chain_id in id_group2]
chain_group_iptm = []
disable = not getattr(my_data, "verbose", True)
for iptm_array in tqdm(
my_data.df["chain_pair_iptm"], total=len(my_data.df["pdb"]), disable=disable
):
if isinstance(iptm_array, float) and np.isnan(iptm_array):
chain_group_iptm.append(np.nan)
continue
to_avg = []
for chain1 in group1:
for chain2 in group2:
pair_iptm = iptm_array[chain1][chain2]
to_avg.append(pair_iptm)
chain_group_iptm.append(np.mean(to_avg))
# register the ipTM of the chain interface
my_data.df[colname] = chain_group_iptm