Small fixes to the database interface and associated metadata to handle broken tests and differing field names in MSP files

kheal · kheal · commit f011b7cd337a · 2025-04-22T15:42:59.000-07:00
diff --git a/corems/molecular_id/factory/EI_SQL.py b/corems/molecular_id/factory/EI_SQL.py
@@ -216,7 +216,6 @@ class MetaboliteMetadata:
     """
 
     id: int
-    name: Optional[str]=None
     cas: str
     inchikey: str
     inchi: str
@@ -227,6 +226,7 @@ class MetaboliteMetadata:
     iupac_name: str
     traditional_name: str
     common_name: str
+    name: Optional[str]=None
     formula: Optional[str]=None
     pubchem_id: Optional[str]=None
     refmet_id: Optional[str]=None
diff --git a/corems/molecular_id/factory/lipid_molecular_metadata.py b/corems/molecular_id/factory/lipid_molecular_metadata.py
@@ -5,39 +5,33 @@
 
 from .EI_SQL import MetaboliteMetadata
 
-
 @dataclass
 class LipidMetadata(MetaboliteMetadata):
-    """Dataclass for the Lipid Metadata
-
-    Parameters
-    ----------
-    name : str
-        The name of the lipid, using the LIPID MAPS nomenclature
-    casno : str
-        The CAS number of the lipid
-    formula : str
-        The molecular formula of the lipid
-    pubchem_id : str
-        The PubChem ID of the lipid
-    structure_level : str
-        The structure level of the lipid, following the LIPID MAPS classification
-    lipid_summed_name : str
-        The summed name of the lipid, aka lipid species,
-        following the LIPID MAPS classification
-    lipid_subclass : str
-        The subclass of the lipid, following the LIPID MAPS classification
-    lipid_class : str
-        The class of the lipid, following the LIPID MAPS classification
-    lipid_category : str
-        The category of the lipid, following the LIPID MAPS classification
-    """
-
-    name: str
-    casno: str
-    structure_level: str
+    def __init__(self, casno: str, structure_level: str, lipid_summed_name: str, lipid_subclass: str, lipid_class: str, lipid_category: str, **kwargs):
+        """
+        Initialize LipidMetadata with specific attributes and pass additional arguments to the superclass.
 
-    lipid_summed_name: str
-    lipid_subclass: str
-    lipid_class: str
-    lipid_category: str
+        Parameters
+        ----------
+        casno : str
+            The CAS number of the lipid
+        structure_level : str
+            The structure level of the lipid
+        lipid_summed_name : str
+            The summed name of the lipid
+        lipid_subclass : str
+            The subclass of the lipid
+        lipid_class : str
+            The class of the lipid
+        lipid_category : str
+            The category of the lipid
+        kwargs : dict
+            Additional arguments for the superclass
+        """
+        super().__init__(**kwargs)
+        self.casno = casno
+        self.structure_level = structure_level
+        self.lipid_summed_name = lipid_summed_name
+        self.lipid_subclass = lipid_subclass
+        self.lipid_class = lipid_class
+        self.lipid_category = lipid_category
diff --git a/corems/molecular_id/search/database_interfaces.py b/corems/molecular_id/search/database_interfaces.py
@@ -1308,33 +1308,52 @@ def get_metabolomics_spectra_library(
                 "kegg_id": "kegg",
                 "refmet_name": "common_name",
                 "molecular_formula": "formula",
+                "gnps_spectra_id":"id",
+                "precursormz": "precursor_mz",
+                "precursortype":"ion_type"
             }
         db_df.rename(columns=metabolite_metadata_mapping, inplace=True)
+        db_df["molecular_data_id"] = db_df["inchikey"]
+
+
+
+        # Check if the resulting dataframe has the required columns for the flash entropy search
+        required_columns = ["molecular_data_id", "precursor_mz", "ion_type", "id"]
+        for col in required_columns:
+            if col not in db_df.columns:
+                raise ValueError(
+                    f"Input field on MSP must contain '{col}' column for FlashEntropy search."
+                )
 
         # Pull out the metabolite metadata from the dataframe and put it into a different dataframe
         # First get a list of the possible attributes of the MetaboliteMetadata dataclass
         metabolite_metadata_keys = list(MetaboliteMetadata.__annotations__.keys())
+        # Replace id with molecular_data_id in metabolite_metadata_keys
+        metabolite_metadata_keys = [
+            "molecular_data_id" if x == "id" else x for x in metabolite_metadata_keys
+        ]
         metabolite_metadata_df = db_df[
             db_df.columns[db_df.columns.isin(metabolite_metadata_keys)]
         ].copy()
 
-        # Make unique and use inchikey as the id/index
-        metabolite_metadata_df.drop_duplicates(subset=["inchikey"], inplace=True)
-        metabolite_metadata_df["id"] = metabolite_metadata_df["inchikey"]
+        # Make unique and recast the id column for metabolite metadata
+        metabolite_metadata_df.drop_duplicates(subset=["molecular_data_id"], inplace=True)
+        metabolite_metadata_df["id"] = metabolite_metadata_df["molecular_data_id"]
 
         # Convert to a dictionary using the inchikey as the key
-        metabolite_metadata_dict = metabolite_metadata_df.set_index("id").to_dict(
+        metabolite_metadata_dict = metabolite_metadata_df.to_dict(
             orient="records"
         )
         metabolite_metadata_dict = {
-            v["inchikey"]: self._dict_to_dataclass(v, MetaboliteMetadata)
+            v["id"]: self._dict_to_dataclass(v, MetaboliteMetadata)
             for v in metabolite_metadata_dict
         }
 
         # Remove the metabolite metadata columns from the original dataframe
         for key in metabolite_metadata_keys:
-            if key in db_df.columns:
-                db_df.drop(columns=key, inplace=True)
+            if key != "molecular_data_id":
+                if key in db_df.columns:
+                    db_df.drop(columns=key, inplace=True)
 
         # Format the spectral library
         format_func = self._get_format_func(format)
diff --git a/corems/molecular_id/search/lcms_spectral_search.py b/corems/molecular_id/search/lcms_spectral_search.py
@@ -61,10 +61,13 @@ def get_more_match_quals(
 
         """
 
-        # Get the original mz values from the library entry
-        lib_mzs = np.array(
-            re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
-        ).reshape(-1, 2)[:, 0]
+        if "mz" in lib_entry.keys():
+            # Get the original mz values from the library entry
+            lib_mzs = np.array(
+                re.findall(r"\(([^,]+),([^)]+)\)", lib_entry["mz"]), dtype=float
+            ).reshape(-1, 2)[:, 0]
+        elif "peaks" in lib_entry.keys() and lib_entry["peaks"] is not None:
+            lib_mzs = lib_entry["peaks"][:, 0]
 
         # Get count and fraction of peaks in query that are in lib entry
         query_in_lib = 0
diff --git a/support_code/nmdc/metabolomics/lcms_metabolomics_workflow.py b/support_code/nmdc/metabolomics/lcms_metabolomics_workflow.py
@@ -4,54 +4,13 @@
 from pathlib import Path
 from multiprocessing import Pool
 from corems.molecular_id.search.database_interfaces import MSPInterface
+from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra
 
 from support_code.nmdc.lipidomics.lipidomics_workflow import (
-    instantiate_lcms_obj,
-    set_params_on_lcms_obj,
-    check_scan_translator,
-    add_mass_features,
-    molecular_formula_search,
-    export_results,
     run_lipid_sp_ms1,
+    process_ms2
 )
 
-
-def run_lcms_metabolomics_workflow(
-    file_dir,
-    out_dir,
-    params_toml,
-    msp_file_path,
-    scan_translator=None,
-    verbose=True,
-    cores=1,
-):
-    # Make output dir and get list of files to process
-    out_dir.mkdir(parents=True, exist_ok=True)
-    files_list = list(file_dir.glob("*.raw"))
-    out_paths_list = [out_dir / f.stem for f in files_list]
-
-    # Prepare search databases for ms2 search
-    my_msp_FE = prepare_metadata(msp_file_path)
-
-    # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export temp results
-    # Note that this is exactly the same as the lipidomics workflow
-    if cores == 1 or len(files_list) == 1:
-        for file_in, file_out in list(zip(files_list, out_paths_list)):
-            print(f"Processing {file_in}")
-            run_lipid_sp_ms1(
-                file_in=str(file_in),
-                out_path=str(file_out),
-                params_toml=params_toml,
-                scan_translator=scan_translator,
-                verbose=verbose,
-                return_mzs=False,
-            )
-    elif cores > 1:
-        raise ValueError(
-            "Parallel processing is not yet supported for LCMS metabolomics workflow."
-        )
-
-
 def prepare_metadata(msp_file_path):
     print("Parsing MSP file...")
     my_msp = MSPInterface(file_path=msp_file_path)
@@ -63,7 +22,7 @@ def prepare_metadata(msp_file_path):
     msp_positive, metabolite_metadata_positive = (
         my_msp.get_metabolomics_spectra_library(
             polarity="positive",
-            format="df",
+            format="flashentropy",
             normalize=True,
             fe_kwargs={
                 "normalize_intensity": True,
@@ -98,6 +57,68 @@ def prepare_metadata(msp_file_path):
 
     return metadata
 
+def run_ms2_search(out_path, metadata, scan_translator=None):
+    """Run ms2 spectral search and export final results
+
+    Parameters
+    ----------
+    out_path : str or Path
+        Path to output file
+    metadata : dict
+        Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively
+
+    Returns
+    -------
+    None, runs ms2 spectral search and exports final results
+    """
+    # Read in the intermediate results
+    out_path = Path(out_path)
+    out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5"
+    parser = ReadCoreMSHDFMassSpectra(out_path_hdf5)
+    myLCMSobj = parser.get_lcms_obj()
+    process_ms2(myLCMSobj, metadata, scan_translator=scan_translator)
+    
+def run_lcms_metabolomics_workflow(
+    file_dir,
+    out_dir,
+    params_toml,
+    msp_file_path,
+    scan_translator=None,
+    verbose=True,
+    cores=1,
+):
+    # Make output dir and get list of files to process
+    out_dir.mkdir(parents=True, exist_ok=True)
+    files_list = list(file_dir.glob("*.raw"))
+    out_paths_list = [out_dir / f.stem for f in files_list]
+
+    # Prepare search databases for ms2 search
+    my_msp_FE = prepare_metadata(msp_file_path)
+
+    # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export temp results
+    # Note that this is exactly the same as the lipidomics workflow
+    if cores == 1 or len(files_list) == 1:
+        for file_in, file_out in list(zip(files_list, out_paths_list)):
+            print(f"Processing {file_in}")
+            run_lipid_sp_ms1(
+                file_in=str(file_in),
+                out_path=str(file_out),
+                params_toml=params_toml,
+                scan_translator=scan_translator,
+                verbose=verbose,
+                return_mzs=False,
+            )
+            # TODO(KRH): Saving the hdf5 and re-opening it is unnecessary; signal processing and the ms2 search could be combined with the lcms_obj kept in memory
+            run_ms2_search(
+                out_path=str(file_out),
+                metadata=my_msp_FE,
+                scan_translator=scan_translator,
+            )
+    elif cores > 1:
+        raise ValueError(
+            "Parallel processing is not yet supported for LCMS metabolomics workflow."
+        )
+
 
 if __name__ == "__main__":
     # Set input variables to run