import itertools
import re
import matplotlib
import numpy as np
import pandas as pd
from faker import Factory
from pyccapt.calibration.data_tools import data_tools
[docs]
def fix_parentheses(c):
"""
Fix parentheses in a given string by expanding the elements inside them.
Args:
c (str): Input string containing parentheses.
Returns:
str: String with expanded elements inside parentheses.
"""
index = []
for i in range(len(c)):
if c[i] == '(':
index.append(i + 1)
if c[i] == ')':
index.append(i)
index.append(int(c[i + 1]))
index = list(chunks(index, 3))
list_parentheses = []
for i in range(len(index)):
tmp = c[index[i][0] : index[i][1]]
tmp = re.findall('[A-Z][^A-Z]*', tmp)
for j in range(len(tmp)):
if tmp[j].isalpha():
tmp[j] = tmp[j] + str(index[i][-1])
elif not tmp[j].isalpha():
dd = int(re.findall(r'\d+', tmp[j])[0]) * index[i][-1]
tmp[j] = ''.join([p for p in tmp[j] if not p.isdigit()]) + str(dd)
list_parentheses.append("".join(tmp))
for i in range(len(list_parentheses)):
gg = list_parentheses[i]
c = list(c)
c[index[i][0] - 1 - (2 * i) : index[i][1] + 2] = list_parentheses[i]
return ''.join(c)
[docs]
def chunks(lst, n):
"""
Yield successive n-sized chunks from a list.
Args:
lst (list): The input list.
n (int): The chunk size.
Yields:
list: Successive n-sized chunks from the input list.
"""
for i in range(0, len(lst), n):
yield lst[i : i + n]
def _sequence_key(values):
"""Convert list-like candidate fields to stable tuple keys."""
if isinstance(values, np.ndarray):
values = values.tolist()
if isinstance(values, (list, tuple)):
return tuple(int(v) if isinstance(v, (np.integer, int)) else v for v in values)
return values
def _candidate_atom_count(complex_values):
"""Return the total number of atoms represented by a candidate ion."""
if isinstance(complex_values, np.ndarray):
complex_values = complex_values.tolist()
if isinstance(complex_values, (list, tuple)):
return int(np.sum(np.asarray(complex_values, dtype=float)))
return int(complex_values)
[docs]
def rank_candidate_assignments(df, target_mass=None, variables=None):
"""
Rank candidate atoms/molecules using mass error, abundance, and simplicity.
Lower score is better. The resulting dataframe is deduplicated and sorted so
the most plausible assignment appears first.
"""
if df is None or df.empty:
return df
ranked = df.copy()
ranked['n_atoms'] = ranked['complex'].apply(_candidate_atom_count)
if target_mass is not None:
target_mass = float(target_mass)
ranked['mass_error'] = np.abs(ranked['mass'].astype(float) - target_mass)
if target_mass != 0:
ranked['ppm_error'] = ranked['mass_error'] / abs(target_mass) * 1e6
else:
ranked['ppm_error'] = np.nan
else:
if 'mass_error' not in ranked.columns:
ranked['mass_error'] = np.nan
if 'ppm_error' not in ranked.columns:
ranked['ppm_error'] = np.nan
abundance_safe = np.clip(ranked['abundance'].astype(float), 1e-12, None)
ranked['score'] = (
np.nan_to_num(ranked['mass_error'].astype(float), nan=0.0)
+ 0.05 * np.maximum(ranked['n_atoms'].astype(float) - 1.0, 0.0)
+ 0.02 * np.maximum(np.log10(1.0 / abundance_safe), 0.0)
)
ranked['_element_key'] = ranked['element'].apply(_sequence_key)
ranked['_complex_key'] = ranked['complex'].apply(_sequence_key)
ranked['_isotope_key'] = ranked['isotope'].apply(_sequence_key)
ranked = ranked.sort_values(
by=['score', 'mass_error', 'n_atoms', 'mass'],
ascending=[True, True, True, True],
)
ranked = ranked.drop_duplicates(
subset=['_element_key', '_complex_key', '_isotope_key', 'charge'],
keep='first',
)
ranked = ranked.drop(columns=['_element_key', '_complex_key', '_isotope_key'])
if 'mass_error' in ranked.columns:
ranked['mass_error'] = ranked['mass_error'].round(5)
if 'ppm_error' in ranked.columns:
ranked['ppm_error'] = ranked['ppm_error'].round(1)
ranked['score'] = ranked['score'].round(5)
ranked = ranked.reset_index(drop=True)
if variables is not None:
variables.range_data_backup = ranked.copy()
return ranked
[docs]
def find_closest_elements(target_elem, num_elements, abundance_threshold=0.0, charge=4, variables=None):
"""
Find the closest elements to a target element.
Args:
target_elem (float): Target element.
num_elements (int): Number of closest elements to find.
abundance_threshold (float): Abundance threshold for filtering elements (as a percentage).
charge (int): Charge value.
variables (object): Object containing the variables.
Returns:
pd.DataFrame: DataFrame containing closest elements and their properties.
"""
try:
data_table = '../../../files/isotopeTable.h5'
dataframe = pd.read_hdf(data_table)
except Exception as e:
try:
data_table = './pyccapt/files/isotopeTable.h5'
dataframe = pd.read_hdf(data_table)
except:
print("Error loading the file", e)
# Expand elements based on charge
elements = dataframe['element'].to_numpy()
isotope_number = dataframe['isotope'].to_numpy()
weight = dataframe['weight'].to_numpy()
abundance = dataframe['abundance'].to_numpy()
elements = np.repeat(elements, charge)
isotope_number = np.repeat(isotope_number, charge)
weights = np.repeat(weight, charge)
abundance = np.repeat(abundance, charge)
charge_list = np.array([i % charge + 1 for i in range(len(weights))])
weights = weights / charge_list
# Filter elements by abundance threshold
abundance_threshold *= 100
mask_abundanc = abundance > abundance_threshold
elements = elements[mask_abundanc]
isotope_number = isotope_number[mask_abundanc]
weights = weights[mask_abundanc]
abundance = abundance[mask_abundanc]
# Find closest elements
pool_size = min(len(weights), max(int(num_elements) * 5, int(num_elements)))
idxs = np.argsort(np.abs(weights - target_elem))[:pool_size]
selected_elements = elements[idxs]
selected_isotope_number = isotope_number[idxs]
selected_weights = weights[idxs]
selected_abundance = abundance[idxs]
selected_charge_list = charge_list[idxs]
# Create LaTeX formatted element symbols
element_symbols = []
for i in range(len(idxs)):
formula = ''
formula += '{}^'
formula += '{%s}' % selected_isotope_number[i]
formula += '%s' % selected_elements[i]
if selected_charge_list[i] > 1:
formula = r'$' + formula + '^{%s+}$' % selected_charge_list[i]
else:
formula = r'$' + formula + '^{+}$'
element_symbols.append(formula)
selected_elements = [[item] for item in selected_elements]
complex = np.ones(len(idxs), dtype=int)
complex = [[item] for item in complex]
selected_isotope_number = [[item] for item in selected_isotope_number]
selected_isotope_number = [[np.uint32(float(value[0]))] for value in selected_isotope_number]
complex = [[np.uint32(float(value[0]))] for value in complex]
selected_charge_list = [np.uint32(value) for value in selected_charge_list]
# Create DataFrame
df = pd.DataFrame(
{
'ion': element_symbols,
'mass': selected_weights,
'element': selected_elements,
'complex': complex,
'isotope': selected_isotope_number,
'charge': selected_charge_list,
'abundance': selected_abundance,
}
)
# Sort DataFrame
# Round the abundance column to 4 decimal places
df['abundance'] = df['abundance'].round(4)
# Divide all elements in abundance by 100
df['abundance'] = df['abundance'] / 100
df['mass'] = df['mass'].round(4)
df = rank_candidate_assignments(df, target_mass=target_elem, variables=variables)
return df.head(int(num_elements)).reset_index(drop=True)
[docs]
def load_elements(target_elements, abundance_threshold=0.0, charge=4, variables=None):
"""
create a dataframe from the given list of ions.
Args:
target_elements (str): Target elements.
abundance_threshold (float): Abundance threshold for filtering elements (as a percentage).
charge (int): Charge value.
data_table (str): Path to the data table (HDF5 file).
variables (object): Object containing the variables.
Returns:
pd.DataFrame: DataFrame containing closest elements and their properties.
"""
try:
data_table = '../../../files/isotopeTable.h5'
dataframe = pd.read_hdf(data_table)
except:
try:
data_table = './pyccapt/files/isotopeTable.h5'
dataframe = pd.read_hdf(data_table)
except Exception as e:
print("Error loading the file", e)
# Expand elements based on charge
elements = dataframe['element'].to_numpy()
isotope_number = dataframe['isotope'].to_numpy()
weight = dataframe['weight'].to_numpy()
abundance = dataframe['abundance'].to_numpy()
elements = np.repeat(elements, charge)
isotope_number = np.repeat(isotope_number, charge)
weights = np.repeat(weight, charge)
abundance = np.repeat(abundance, charge)
charge_list = np.array([i % charge + 1 for i in range(len(weights))])
weights = weights / charge_list
# Filter elements by abundance threshold
abundance_threshold *= 100
mask_abundanc = abundance > abundance_threshold
elements = elements[mask_abundanc]
isotope_number = isotope_number[mask_abundanc]
weights = weights[mask_abundanc]
abundance = abundance[mask_abundanc]
target_elements = target_elements.split(',')
target_elements = [s.replace(' ', '') for s in target_elements]
index_elements = []
for i in range(len(target_elements)):
index_elements.append([index for index, element in enumerate(elements) if element == target_elements[i]])
idxs = [item for sublist in index_elements for item in sublist]
selected_elements = elements[idxs]
selected_isotope_number = isotope_number[idxs]
selected_weights = weights[idxs]
selected_abundance = abundance[idxs]
selected_charge_list = charge_list[idxs]
# Create LaTeX formatted element symbols
element_symbols = []
for i in range(len(idxs)):
formula = ''
formula += '{}^'
formula += '{%s}' % selected_isotope_number[i]
formula += '%s' % selected_elements[i]
if selected_charge_list[i] > 1:
formula = r'$' + formula + '^{%s+}$' % selected_charge_list[i]
else:
formula = r'$' + formula + '^{+}$'
element_symbols.append(formula)
selected_elements = [[item] for item in selected_elements]
complex = np.ones(len(idxs), dtype=int)
complex = [[item] for item in complex]
selected_isotope_number = [[item] for item in selected_isotope_number]
complex = [[np.uint32(float(value[0]))] for value in complex]
selected_isotope_number = [[np.uint32(float(value[0]))] for value in selected_isotope_number]
selected_charge_list = [np.uint32(value) for value in selected_charge_list]
# Create DataFrame
df = pd.DataFrame(
{
'ion': element_symbols,
'mass': selected_weights,
'element': selected_elements,
'complex': complex,
'isotope': selected_isotope_number,
'charge': selected_charge_list,
'abundance': selected_abundance,
}
)
# Round the abundance column to 4 decimal places
df['abundance'] = df['abundance'].round(4)
# Divide all elements in abundance by 100
df['abundance'] = df['abundance'] / 100
df['mass'] = df['mass'].round(4)
return rank_candidate_assignments(df, variables=variables)
[docs]
def molecule_manual(target_element, charge, latex=True, variables=None):
"""
Generate a list of isotopes for a given target element.
Args:
target_element (str): The target element to find isotopes for.
charge (int): The charge of the target element.
latex (bool, optional): Whether to generate LaTeX representation of formulas. Defaults to True.
variables (object, optional): The variables object. Defaults to None.
Returns:
pd.DataFrame: A DataFrame containing the list of isotopes with their weights and abundances.
"""
try:
isotopeTableFile = '../../../files/isotopeTable.h5'
dataframe = pd.read_hdf(isotopeTableFile, mode='r')
except:
try:
isotopeTableFile = './pyccapt/files/isotopeTable.h5'
dataframe = pd.read_hdf(isotopeTableFile, mode='r')
except Exception as e:
print("Error loading the file", e)
target_element = fix_parentheses(target_element)
elements = dataframe['element'].to_numpy()
isotope_number = dataframe['isotope'].to_numpy()
abundance = dataframe['abundance'].to_numpy()
weight = dataframe['weight'].to_numpy()
# Extract numbers enclosed in curly braces and store them in a list
isotope_list = [int(match.group(1)) for match in re.finditer(r'{(\d+)}', target_element)]
# Extract uppercase letters and store them in a list
element_list = re.findall(r'[A-Z][a-z]*', target_element)
# Extract numbers that follow the uppercase letters and store them in a list
complexity_list = [int(match[1]) for match in re.findall(r'([A-Z][a-z]*)(\d+)', target_element)]
total_weight = 0
abundance_c = 1
for i, isotop in enumerate(isotope_list):
index = np.where(isotope_number == isotop)
total_weight += weight[index] * complexity_list[i]
abundance_c *= (abundance[index] / 100) ** complexity_list[i]
total_weight = total_weight / charge
if latex:
formula = ''
for i in range(len(isotope_list)):
isotope = isotope_list[i]
element = element_list[i]
comp = complexity_list[i]
formula += '{}^'
formula += '{%s}' % isotope
formula += '%s' % element
if comp != 1:
formula += '_{%s}' % comp
if charge > 1:
formula = r'$(' + formula + ')^{%s+}$' % charge
else:
formula = r'$' + formula + ')^{+}$'
else:
formula = target_element
# Convert float_list to a list of uint32 values
complexity_list = [np.uint32(value) for value in complexity_list]
isotope_list = [np.uint32(value) for value in isotope_list]
charge = np.uint32(charge)
element_list = [element_list]
complexity_list = [complexity_list]
isotope_list = [isotope_list]
charge = [charge]
df = pd.DataFrame(
{
'ion': formula,
'mass': total_weight,
'element': element_list,
'complex': complexity_list,
'isotope': isotope_list,
'charge': charge,
'abundance': abundance_c,
}
)
# Round the abundance column to 4 decimal places
df['abundance'] = df['abundance'].round(4)
df['mass'] = df['mass'].round(4)
return rank_candidate_assignments(df, variables=variables)
[docs]
def molecule_create(element_list, max_complexity, charge, abundance_threshold, variables=None, latex=True):
"""
Generate a list of isotopes for a given target element.
Args:
element_list (str): The target element to find isotopes for.
max_complexity (int): The maximum complexity of the molecule.
charge (int): The charge of the target element.
abundance_threshold (float): The abundance threshold for filtering isotopes.
variables (object, optional): The variables object. Defaults to None.
latex (bool, optional): Whether to generate LaTeX representation of formulas. Defaults to True.
Returns:
pd.DataFrame: A DataFrame containing the list of isotopes with their weights and abundances.
"""
try:
isotopeTableFile = '../../../files/isotopeTable.h5'
dataframe = data_tools.read_range(isotopeTableFile)
except:
try:
isotopeTableFile = './pyccapt/files/isotopeTable.h5'
dataframe = data_tools.read_range(isotopeTableFile)
except Exception as e:
print("Error loading the file", e)
elements = dataframe['element'].to_numpy()
isotope_number = dataframe['isotope'].to_numpy()
abundance = dataframe['abundance'].to_numpy()
weight = dataframe['weight'].to_numpy()
element_list = element_list.split(',')
element_list = [s.replace(' ', '') for s in element_list]
indices_elements = np.where(np.isin(elements, [element_list]))
selected_elements = elements[indices_elements[0]]
selected_isotope_number = isotope_number[indices_elements[0]]
selected_weights = weight[indices_elements[0]]
selected_abundance = abundance[indices_elements[0]]
# Create a list of elements with their respective isotope numbers, weights, and abundances
element_data = [
{'element': elem, 'isotope': iso, 'weight': w, 'abundance': ab}
for elem, iso, w, ab in zip(selected_elements, selected_isotope_number, selected_weights, selected_abundance)
]
# Initialize lists to store results
combinations = []
combinations_list = []
combination_weights = []
combination_isotopes = []
combination_abundances = []
combination_charge = []
combination_complexity = []
combination_formula = []
# Generate all possible combinations with complexity 3
for comploex in range(1, max_complexity + 1):
for combo in itertools.product(element_data, repeat=comploex):
combo_elements = [elem['element'] for elem in combo]
combo_weights = [elem['weight'] for elem in combo]
combo_abundances = [elem['abundance'] for elem in combo]
combo_weight = sum(combo_weights)
combo_abundance = 1.0
for ab in combo_abundances:
combo_abundance *= ab / 100
for i in range(charge):
combinations.append(combo_elements)
combinations_list.append(combo_elements)
combination_weights.append(combo_weight / (i + 1))
combination_isotopes.append([elem['isotope'] for elem in combo])
combination_abundances.append(combo_abundance)
combination_charge.append(i + 1)
for i in range(len(combinations)):
new_combination, new_isotopes, complexity = transform_combination_and_isotopes(
combinations[i], combination_isotopes[i]
)
combinations[i] = new_combination
combination_isotopes[i] = new_isotopes
combination_complexity.append(complexity)
charge = combination_charge[i]
if latex:
formula = ''
for i in range(len(new_isotopes)):
isotope = new_isotopes[i]
element = new_combination[i]
comp = complexity[i]
formula += '{}^'
formula += '{%s}' % isotope
formula += '%s' % element
if comp != 1:
formula += '_{%s}' % comp
if charge > 1:
formula = r'$' + formula + '^{%s+}$' % charge
else:
formula = r'$' + formula + '^{+}$'
else:
element_counts = {}
chemical_formula = ''
for element in element_list:
if element in element_counts:
element_counts[element] += 1
else:
element_counts[element] = 1
for element, count in element_counts.items():
chemical_formula += element
if count > 1:
chemical_formula += str(count)
formula = chemical_formula
combination_formula.append(formula)
combination_complexity = [[np.uint32(x) for x in sub_list] for sub_list in combination_complexity]
combination_isotopes = [[np.uint32(x) for x in sub_list] for sub_list in combination_isotopes]
combination_charge = [np.uint32(value) for value in combination_charge]
# Create DataFrame
df = pd.DataFrame(
{
'ion': combination_formula,
'mass': combination_weights,
'element': combinations,
'complex': combination_complexity,
'isotope': combination_isotopes,
'charge': combination_charge,
'abundance': combination_abundances,
}
)
df = df[df['abundance'] > abundance_threshold]
# Round the abundance column to 4 decimal places
df['abundance'] = df['abundance'].round(4)
df['mass'] = df['mass'].round(4)
return rank_candidate_assignments(df, variables=variables)
[docs]
def ranging_dataset_create(variables, row_index, mass_ion):
"""
This function is used to create the ranging dataset
Arg:
variables (class): The class of the variables
row_index (int): The index of the selected row
mass_ion (float): The mass of the element
Returns:
None
"""
if len(variables.range_data_backup) == 0:
print('The dataframe of elements is empty')
print('First press the button to find the closest elements')
else:
if row_index >= 0:
row = variables.range_data_backup.iloc[row_index]
selected_row = [
row['ion'],
float(row['mass']),
list(row['element']),
list(row['complex']),
list(row['isotope']),
np.uint32(row['charge']),
]
else:
selected_row = ['un', mass_ion, ['unranged'], [0], [0], 0]
fake = Factory.create()
try:
data_table = '../../../files/color_scheme.h5'
dataframe = data_tools.read_range(data_table)
except:
try:
data_table = './pyccapt/files/color_scheme.h5'
dataframe = data_tools.read_range(data_table)
except Exception as e:
print("Error loading the file", e)
element_selec = selected_row[2]
if len(element_selec) == 1:
element_selec = element_selec[0]
try:
color_rgb = dataframe[dataframe['ion'].str.contains(element_selec, na=False)].to_numpy().tolist()
color = matplotlib.colors.to_hex([color_rgb[0][1], color_rgb[0][2], color_rgb[0][3]])
print(f"Color: {color}")
except Exception as e:
print(f"An error occurred: {e}")
print('The element is not color list')
color = fake.hex_color()
else:
color = fake.hex_color()
if not variables.h_line_pos:
print('The h_line_pos is empty')
print('First specify the left and right boundary for the selected peak')
else:
# Find the closest h_line that is smaller than mass
smaller_candidates = [x for x in variables.h_line_pos if x < mass_ion]
# Find the closest h_line that is bigger than mass
bigger_candidates = [x for x in variables.h_line_pos if x > mass_ion]
if not smaller_candidates or not bigger_candidates:
print('Could not find valid lower and upper peak boundaries around the selected mass.')
print('Please place two range lines around the target peak first.')
return
smaller_h_line = max(smaller_candidates)
bigger_h_line = min(bigger_candidates)
def generate_name(elements, counts):
return ".".join(f"{el}{ct}" for el, ct in zip(elements, counts))
name = generate_name(selected_row[2], selected_row[3])
selected_row.insert(0, name)
selected_row.insert(3, mass_ion)
selected_row.insert(4, smaller_h_line)
selected_row.insert(5, bigger_h_line)
selected_row.insert(6, color)
# Add the row to the DataFrame using the .loc method
selected_row[9] = np.uint32(selected_row[9])
print(f"Selected row: {selected_row}")
variables.range_data.loc[len(variables.range_data)] = selected_row
[docs]
def display_color(color):
"""
This function is used to display the color in the table
Arg:
color (str): The color in hex format
Returns:
str: The color in hex format
"""
color = str(color).strip()
if color and not color.startswith('#') and re.fullmatch(r'[A-Fa-f0-9]{6}', color):
color = f'#{color}'
return f'background-color: {color}; width: 50px; height: 20px;'