Skip to content

Commit f011b7c

Browse files
committed
Small fixes to database interface and associated metadata to deal with broken tests and different names of fields in msp
1 parent 3da6f71 commit f011b7c

5 files changed

Lines changed: 126 additions & 89 deletions

File tree

corems/molecular_id/factory/EI_SQL.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,6 @@ class MetaboliteMetadata:
216216
"""
217217

218218
id: int
219-
name: Optional[str]=None
220219
cas: str
221220
inchikey: str
222221
inchi: str
@@ -227,6 +226,7 @@ class MetaboliteMetadata:
227226
iupac_name: str
228227
traditional_name: str
229228
common_name: str
229+
name: Optional[str]=None
230230
formula: Optional[str]=None
231231
pubchem_id: Optional[str]=None
232232
refmet_id: Optional[str]=None

corems/molecular_id/factory/lipid_molecular_metadata.py

Lines changed: 27 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,33 @@
55

66
from .EI_SQL import MetaboliteMetadata
77

8-
98
@dataclass
109
class LipidMetadata(MetaboliteMetadata):
11-
"""Dataclass for the Lipid Metadata
12-
13-
Parameters
14-
----------
15-
name : str
16-
The name of the lipid, using the LIPID MAPS nomenclature
17-
casno : str
18-
The CAS number of the lipid
19-
formula : str
20-
The molecular formula of the lipid
21-
pubchem_id : str
22-
The PubChem ID of the lipid
23-
structure_level : str
24-
The structure level of the lipid, following the LIPID MAPS classification
25-
lipid_summed_name : str
26-
The summed name of the lipid, aka lipid species,
27-
following the LIPID MAPS classification
28-
lipid_subclass : str
29-
The subclass of the lipid, following the LIPID MAPS classification
30-
lipid_class : str
31-
The class of the lipid, following the LIPID MAPS classification
32-
lipid_category : str
33-
The category of the lipid, following the LIPID MAPS classification
34-
"""
35-
36-
name: str
37-
casno: str
38-
structure_level: str
10+
def __init__(self, casno: str, structure_level: str, lipid_summed_name: str, lipid_subclass: str, lipid_class: str, lipid_category: str, **kwargs):
11+
"""
12+
Initialize LipidMetadata with specific attributes and pass additional arguments to the superclass.
3913
40-
lipid_summed_name: str
41-
lipid_subclass: str
42-
lipid_class: str
43-
lipid_category: str
14+
Parameters
15+
----------
16+
casno : str
17+
The CAS number of the lipid
18+
structure_level : str
19+
The structure level of the lipid
20+
lipid_summed_name : str
21+
The summed name of the lipid
22+
lipid_subclass : str
23+
The subclass of the lipid
24+
lipid_class : str
25+
The class of the lipid
26+
lipid_category : str
27+
The category of the lipid
28+
kwargs : dict
29+
Additional arguments for the superclass
30+
"""
31+
super().__init__(**kwargs)
32+
self.casno = casno
33+
self.structure_level = structure_level
34+
self.lipid_summed_name = lipid_summed_name
35+
self.lipid_subclass = lipid_subclass
36+
self.lipid_class = lipid_class
37+
self.lipid_category = lipid_category

corems/molecular_id/search/database_interfaces.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,33 +1308,52 @@ def get_metabolomics_spectra_library(
13081308
"kegg_id": "kegg",
13091309
"refmet_name": "common_name",
13101310
"molecular_formula": "formula",
1311+
"gnps_spectra_id":"id",
1312+
"precursormz": "precursor_mz",
1313+
"precursortype":"ion_type"
13111314
}
13121315
db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
1316+
db_df["molecular_data_id"] = db_df["inchikey"]
1317+
1318+
1319+
1320+
# Check if the resulting dataframe has the required columns for the flash entropy search
1321+
required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
1322+
for col in required_columns:
1323+
if col not in db_df.columns:
1324+
raise ValueError(
1325+
f"Input field on MSP must contain '{col}' column for FlashEntropy search."
1326+
)
13131327

13141328
# Pull out the metabolite metadata from the dataframe and put it into a different dataframe
13151329
# First get a list of the possible attributes of the MetaboliteMetadata dataclass
13161330
metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
1331+
# Replace id with molecular_data_id in metabolite_metadata_keys
1332+
metabolite_metadata_keys = [
1333+
"molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
1334+
]
13171335
metabolite_metadata_df = db_df[
13181336
db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
13191337
].copy()
13201338

1321-
# Make unique and use inchikey as the id/index
1322-
metabolite_metadata_df.drop_duplicates(subset=["inchikey"], inplace=True)
1323-
metabolite_metadata_df["id"] = metabolite_metadata_df["inchikey"]
1339+
# Make unique and recast the id column for metabolite metadata
1340+
metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
1341+
metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
13241342

13251343
# Convert to a dictionary using the inchikey as the key
1326-
metabolite_metadata_dict = metabolite_metadata_df.set_index("id").to_dict(
1344+
metabolite_metadata_dict = metabolite_metadata_df.to_dict(
13271345
orient="records"
13281346
)
13291347
metabolite_metadata_dict = {
1330-
v["inchikey"]: self._dict_to_dataclass(v, MetaboliteMetadata)
1348+
v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
13311349
for v in metabolite_metadata_dict
13321350
}
13331351

13341352
# Remove the metabolite metadata columns from the original dataframe
13351353
for key in metabolite_metadata_keys:
1336-
if key in db_df.columns:
1337-
db_df.drop(columns=key, inplace=True)
1354+
if key != "molecular_data_id":
1355+
if key in db_df.columns:
1356+
db_df.drop(columns=key, inplace=True)
13381357

13391358
# Format the spectral library
13401359
format_func = self._get_format_func(format)

corems/molecular_id/search/lcms_spectral_search.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,13 @@ def get_more_match_quals(
6161
6262
"""
6363

64-
# Get the original mz values from the library entry
65-
lib_mzs = np.array(
66-
re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
67-
).reshape(-1, 2)[:, 0]
64+
if "mz" in lib_entry.keys():
65+
# Get the original mz values from the library entry
66+
lib_mzs = np.array(
67+
re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
68+
).reshape(-1, 2)[:, 0]
69+
elif "peaks" in lib_entry.keys() and lib_entry["peaks"] is not None:
70+
lib_mzs = lib_entry["peaks"][:, 0]
6871

6972
# Get count and fraction of peaks in query that are in lib entry
7073
query_in_lib = 0

support_code/nmdc/metabolomics/lcms_metabolomics_workflow.py

Lines changed: 65 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,13 @@
44
from pathlib import Path
55
from multiprocessing import Pool
66
from corems.molecular_id.search.database_interfaces import MSPInterface
7+
from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra
78

89
from support_code.nmdc.lipidomics.lipidomics_workflow import (
9-
instantiate_lcms_obj,
10-
set_params_on_lcms_obj,
11-
check_scan_translator,
12-
add_mass_features,
13-
molecular_formula_search,
14-
export_results,
1510
run_lipid_sp_ms1,
11+
process_ms2
1612
)
1713

18-
19-
def run_lcms_metabolomics_workflow(
20-
file_dir,
21-
out_dir,
22-
params_toml,
23-
msp_file_path,
24-
scan_translator=None,
25-
verbose=True,
26-
cores=1,
27-
):
28-
# Make output dir and get list of files to process
29-
out_dir.mkdir(parents=True, exist_ok=True)
30-
files_list = list(file_dir.glob("*.raw"))
31-
out_paths_list = [out_dir / f.stem for f in files_list]
32-
33-
# Prepare search databases for ms2 search
34-
my_msp_FE = prepare_metadata(msp_file_path)
35-
36-
# Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export temp results
37-
# Note that this is exactly the same as the lipidomics workflow
38-
if cores == 1 or len(files_list) == 1:
39-
for file_in, file_out in list(zip(files_list, out_paths_list)):
40-
print(f"Processing {file_in}")
41-
run_lipid_sp_ms1(
42-
file_in=str(file_in),
43-
out_path=str(file_out),
44-
params_toml=params_toml,
45-
scan_translator=scan_translator,
46-
verbose=verbose,
47-
return_mzs=False,
48-
)
49-
elif cores > 1:
50-
raise ValueError(
51-
"Parallel processing is not yet supported for LCMS metabolomics workflow."
52-
)
53-
54-
5514
def prepare_metadata(msp_file_path):
5615
print("Parsing MSP file...")
5716
my_msp = MSPInterface(file_path=msp_file_path)
@@ -63,7 +22,7 @@ def prepare_metadata(msp_file_path):
6322
msp_positive, metabolite_metadata_positive = (
6423
my_msp.get_metabolomics_spectra_library(
6524
polarity="positive",
66-
format="df",
25+
format="flashentropy",
6726
normalize=True,
6827
fe_kwargs={
6928
"normalize_intensity": True,
@@ -98,6 +57,68 @@ def prepare_metadata(msp_file_path):
9857

9958
return metadata
10059

60+
def run_ms2_search(out_path, metadata, scan_translator=None):
61+
"""Run ms2 spectral search and export final results
62+
63+
Parameters
64+
----------
65+
out_path : str or Path
66+
Path to output file
67+
metadata : dict
68+
Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively
69+
70+
Returns
71+
-------
72+
None, runs ms2 spectral search and exports final results
73+
"""
74+
# Read in the intermediate results
75+
out_path = Path(out_path)
76+
out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5"
77+
parser = ReadCoreMSHDFMassSpectra(out_path_hdf5)
78+
myLCMSobj = parser.get_lcms_obj()
79+
process_ms2(myLCMSobj, metadata, scan_translator=scan_translator)
80+
81+
def run_lcms_metabolomics_workflow(
82+
file_dir,
83+
out_dir,
84+
params_toml,
85+
msp_file_path,
86+
scan_translator=None,
87+
verbose=True,
88+
cores=1,
89+
):
90+
# Make output dir and get list of files to process
91+
out_dir.mkdir(parents=True, exist_ok=True)
92+
files_list = list(file_dir.glob("*.raw"))
93+
out_paths_list = [out_dir / f.stem for f in files_list]
94+
95+
# Prepare search databases for ms2 search
96+
my_msp_FE = prepare_metadata(msp_file_path)
97+
98+
# Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export temp results
99+
# Note that this is exactly the same as the lipidomics workflow
100+
if cores == 1 or len(files_list) == 1:
101+
for file_in, file_out in list(zip(files_list, out_paths_list)):
102+
print(f"Processing {file_in}")
103+
run_lipid_sp_ms1(
104+
file_in=str(file_in),
105+
out_path=str(file_out),
106+
params_toml=params_toml,
107+
scan_translator=scan_translator,
108+
verbose=verbose,
109+
return_mzs=False,
110+
)
111+
#TODO KRH: No need to save hdf5 and re-open, can combine sp and ms2 search with lcms_obj in memory
112+
run_ms2_search(
113+
out_path=str(file_out),
114+
metadata=my_msp_FE,
115+
scan_translator=scan_translator,
116+
)
117+
elif cores > 1:
118+
raise ValueError(
119+
"Parallel processing is not yet supported for LCMS metabolomics workflow."
120+
)
121+
101122

102123
if __name__ == "__main__":
103124
# Set input variables to run

0 commit comments

Comments
 (0)