Skip to content

Commit 1570a72

Browse files
committed
Clean up lcms worklfow exporter and example in preparation for test
1 parent 9d906bf commit 1570a72

2 files changed

Lines changed: 61 additions & 76 deletions

File tree

corems/mass_spectra/output/export.py

Lines changed: 53 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,9 +1167,9 @@ def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml")
11671167
"precursor_mz",
11681168
"query_spectrum_id",
11691169
]:
1170-
if k3 == "query_frag_types" or k3 == "ref_frag_types" or k3 == "database_name":
1170+
if k3 == "query_frag_types" or k3 == "ref_frag_types":
11711171
v3 = [", ".join(x) for x in v3]
1172-
if all(v3) != None:
1172+
if all(v3 is not None for v3 in v3):
11731173
array = np.array(v3)
11741174
if array.dtype.str[0:2] == "<U":
11751175
array = array.astype("S")
@@ -1403,69 +1403,37 @@ def summarize_metabolomics_report(self, ms2_annot_report):
14031403
DataFrame
14041404
The summarized metabolomics report.
14051405
"""
1406-
# Prepare metadata with regards to the molecules
1407-
if "chebi" in ms2_annot_report.columns:
1408-
ms2_annot_report["chebi"] = (
1409-
pd.to_numeric(ms2_annot_report["chebi"], errors="coerce")
1410-
.apply(lambda x: str(int(x)) if pd.notna(x) else None)
1411-
)
1412-
1413-
mol_data = ms2_annot_report.groupby(["mf_id", "ref_mol_id", "formula"]).agg(
1414-
{
1415-
"cas": lambda x: ", ".join(x.unique())
1416-
if x.notna().any() else None,
1417-
"inchikey": lambda x: ", ".join(x.unique())
1418-
if x.notna().any() else None,
1419-
"inchi": lambda x: ", ".join(x.unique())
1420-
if x.notna().any() else None,
1421-
"chebi": lambda x: ", ".join(x.unique())
1422-
if x.notna().any() else None,
1423-
"smiles": lambda x: ", ".join(x.unique())
1424-
if x.notna().any() else None,
1425-
"kegg": lambda x: ", ".join(x.unique())
1426-
if x.notna().any() else None,
1427-
"name": lambda x: ", ".join(x.unique())
1428-
if x.notna().any() else None,
1429-
"database_name": lambda x: ", ".join(x.unique())
1430-
}
1431-
).reset_index()
1432-
1406+
columns_to_drop = [
1407+
"precursor_mz",
1408+
"precursor_mz_error_ppm",
1409+
"cas",
1410+
"data_id",
1411+
"iupac_name",
1412+
"traditional_name",
1413+
"common_name",
1414+
"casno",
1415+
]
1416+
ms2_annot = ms2_annot_report.drop(
1417+
columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1418+
)
1419+
14331420
# Prepare information about the search results, pulling out the best hit for the single report
14341421
# Group by mf_id,ref_mol_id grab row with highest entropy similarity
1435-
ms2_annot_report = ms2_annot_report.reset_index()
1422+
ms2_annot = ms2_annot.reset_index()
14361423
# Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1437-
ms2_annot_report["n_spectra_contributing"] = (
1438-
ms2_annot_report.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1424+
ms2_annot["n_spectra_contributing"] = (
1425+
ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
14391426
.transform("nunique")
14401427
)
14411428
# Sort by entropy similarity
1442-
ms2_annot_report = ms2_annot_report.sort_values(
1429+
ms2_annot = ms2_annot.sort_values(
14431430
by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
14441431
)
1445-
best_entropy = ms2_annot_report.drop_duplicates(
1432+
best_entropy = ms2_annot.drop_duplicates(
14461433
subset=["mf_id", "ref_mol_id"], keep="first"
14471434
)
1448-
best_entropy = best_entropy[
1449-
[
1450-
"mf_id",
1451-
"ref_mol_id",
1452-
"ref_ms_id",
1453-
"entropy_similarity",
1454-
"ref_ion_type",
1455-
"ref_mz_in_query_fract",
1456-
"n_spectra_contributing"
1457-
]
1458-
].copy()
1459-
1460-
# Merge the two dataframes
1461-
output = mol_data.merge(
1462-
best_entropy,
1463-
how="left",
1464-
left_on=["mf_id", "ref_mol_id"],
1465-
right_on=["mf_id", "ref_mol_id"],
1466-
)
14671435

1468-
return output
1436+
return best_entropy
14691437

14701438
def clean_ms2_report(self, metabolite_summary):
14711439
"""Clean the MS2 report.
@@ -1486,28 +1454,36 @@ def clean_ms2_report(self, metabolite_summary):
14861454
for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
14871455
]
14881456

1457+
col_order = [
1458+
"mf_id",
1459+
"ion_formula",
1460+
"ref_ion_type",
1461+
"formula",
1462+
"inchikey",
1463+
"name",
1464+
"inchi",
1465+
"chebi",
1466+
"smiles",
1467+
"kegg",
1468+
"cas",
1469+
"database_name",
1470+
"ref_ms_id",
1471+
"entropy_similarity",
1472+
"ref_mz_in_query_fract",
1473+
"n_spectra_contributing",
1474+
]
1475+
14891476
# Reorder columns
14901477
metabolite_summary = metabolite_summary[
1491-
[
1492-
"mf_id",
1493-
"ion_formula",
1494-
"ref_ion_type",
1495-
"formula",
1496-
"inchikey",
1497-
"name",
1498-
"inchi",
1499-
"chebi",
1500-
"smiles",
1501-
"kegg",
1502-
"cas",
1503-
"database_name",
1504-
"ref_ms_id",
1505-
"entropy_similarity",
1506-
"ref_mz_in_query_fract",
1507-
"n_spectra_contributing",
1508-
]
1478+
[col for col in col_order if col in metabolite_summary.columns]
15091479
]
15101480

1481+
# Convert chebi (if present) to int:
1482+
if "chebi" in metabolite_summary.columns:
1483+
metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1484+
"Int64", errors="ignore"
1485+
)
1486+
15111487
# Set the index to mf_id
15121488
metabolite_summary = metabolite_summary.set_index("mf_id")
15131489

@@ -1630,12 +1606,14 @@ def to_report(self, molecular_metadata=None):
16301606
ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
16311607
molecular_metadata=molecular_metadata
16321608
)
1633-
if ms2_annot_report is not None:
1609+
if ms2_annot_report is not None and molecular_metadata is not None:
16341610
ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
16351611
ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
16361612
ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
16371613
ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1638-
1614+
else:
1615+
ms2_annot_report = None
1616+
16391617
report = self.combine_reports(
16401618
mf_report=mf_report,
16411619
ms1_annot_report=ms1_annot_report,
@@ -1982,7 +1960,7 @@ def to_report(self, molecular_metadata=None):
19821960
ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
19831961
molecular_metadata=molecular_metadata
19841962
)
1985-
if ms2_annot_report is not None:
1963+
if ms2_annot_report is not None and molecular_metadata is not None:
19861964
ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
19871965
ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
19881966
ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")

support_code/nmdc/metabolomics/lcms_metabolomics_workflow.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,14 +136,21 @@ def run_lcms_metabolomics_workflow(
136136
add_mass_features(myLCMSobj, scan_translator)
137137
myLCMSobj.remove_unprocessed_data()
138138
molecular_formula_search(myLCMSobj)
139+
# Just for testing, not needed for workflow
140+
export_results(
141+
myLCMSobj,
142+
out_path=str(out_path),
143+
molecular_metadata=metadata["molecular_metadata"],
144+
final=False,
145+
)
139146
process_ms2(myLCMSobj, metadata, scan_translator=scan_translator)
140147
export_results(
141148
myLCMSobj,
142149
out_path=str(out_path),
143150
molecular_metadata=metadata["molecular_metadata"],
151+
final=True,
144152
)
145153

146-
147154
def run_lcms_metabolomics_workflow_batch(
148155
file_dir,
149156
out_dir,

0 commit comments

Comments
 (0)