@@ -1167,9 +1167,9 @@ def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml")
11671167 "precursor_mz" ,
11681168 "query_spectrum_id" ,
11691169 ]:
1170- if k3 == "query_frag_types" or k3 == "ref_frag_types" or k3 == "database_name" :
1170+ if k3 == "query_frag_types" or k3 == "ref_frag_types" :
11711171 v3 = [", " .join (x ) for x in v3 ]
1172- if all (v3 ) != None :
1172+ if all (v3 is not None for v3 in v3 ) :
11731173 array = np .array (v3 )
11741174 if array .dtype .str [0 :2 ] == "<U" :
11751175 array = array .astype ("S" )
@@ -1403,69 +1403,37 @@ def summarize_metabolomics_report(self, ms2_annot_report):
14031403 DataFrame
14041404 The summarized metabolomics report.
14051405 """
1406- # Prepare metadata with regards to the molecules
1407- if "chebi" in ms2_annot_report .columns :
1408- ms2_annot_report ["chebi" ] = (
1409- pd .to_numeric (ms2_annot_report ["chebi" ], errors = "coerce" )
1410- .apply (lambda x : str (int (x )) if pd .notna (x ) else None )
1411- )
1412-
1413- mol_data = ms2_annot_report .groupby (["mf_id" , "ref_mol_id" , "formula" ]).agg (
1414- {
1415- "cas" : lambda x : ", " .join (x .unique ())
1416- if x .notna ().any () else None ,
1417- "inchikey" : lambda x : ", " .join (x .unique ())
1418- if x .notna ().any () else None ,
1419- "inchi" : lambda x : ", " .join (x .unique ())
1420- if x .notna ().any () else None ,
1421- "chebi" : lambda x : ", " .join (x .unique ())
1422- if x .notna ().any () else None ,
1423- "smiles" : lambda x : ", " .join (x .unique ())
1424- if x .notna ().any () else None ,
1425- "kegg" : lambda x : ", " .join (x .unique ())
1426- if x .notna ().any () else None ,
1427- "name" : lambda x : ", " .join (x .unique ())
1428- if x .notna ().any () else None ,
1429- "database_name" : lambda x : ", " .join (x .unique ())
1430- }
1431- ).reset_index ()
1432-
1406+ columns_to_drop = [
1407+ "precursor_mz" ,
1408+ "precursor_mz_error_ppm" ,
1409+ "cas" ,
1410+ "data_id" ,
1411+ "iupac_name" ,
1412+ "traditional_name" ,
1413+ "common_name" ,
1414+ "casno" ,
1415+ ]
1416+ ms2_annot = ms2_annot_report .drop (
1417+ columns = [col for col in columns_to_drop if col in ms2_annot_report .columns ]
1418+ )
1419+
14331420 # Prepare information about the search results, pulling out the best hit for the single report
14341421 # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1435- ms2_annot_report = ms2_annot_report .reset_index ()
1422+ ms2_annot = ms2_annot .reset_index ()
14361423 # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1437- ms2_annot_report ["n_spectra_contributing" ] = (
1438- ms2_annot_report .groupby (["mf_id" , "ref_mol_id" ])["query_spectrum_id" ]
1424+ ms2_annot ["n_spectra_contributing" ] = (
1425+ ms2_annot .groupby (["mf_id" , "ref_mol_id" ])["query_spectrum_id" ]
14391426 .transform ("nunique" )
14401427 )
14411428 # Sort by entropy similarity
1442- ms2_annot_report = ms2_annot_report .sort_values (
1429+ ms2_annot = ms2_annot .sort_values (
14431430 by = ["mf_id" , "ref_mol_id" , "entropy_similarity" ], ascending = [True , True , False ]
14441431 )
1445- best_entropy = ms2_annot_report .drop_duplicates (
1432+ best_entropy = ms2_annot .drop_duplicates (
14461433 subset = ["mf_id" , "ref_mol_id" ], keep = "first"
14471434 )
1448- best_entropy = best_entropy [
1449- [
1450- "mf_id" ,
1451- "ref_mol_id" ,
1452- "ref_ms_id" ,
1453- "entropy_similarity" ,
1454- "ref_ion_type" ,
1455- "ref_mz_in_query_fract" ,
1456- "n_spectra_contributing"
1457- ]
1458- ].copy ()
1459-
1460- # Merge the two dataframes
1461- output = mol_data .merge (
1462- best_entropy ,
1463- how = "left" ,
1464- left_on = ["mf_id" , "ref_mol_id" ],
1465- right_on = ["mf_id" , "ref_mol_id" ],
1466- )
14671435
1468- return output
1436+ return best_entropy
14691437
14701438 def clean_ms2_report (self , metabolite_summary ):
14711439 """Clean the MS2 report.
@@ -1486,28 +1454,36 @@ def clean_ms2_report(self, metabolite_summary):
14861454 for f , a in zip (metabolite_summary ["formula" ], metabolite_summary ["ref_ion_type" ])
14871455 ]
14881456
1457+ col_order = [
1458+ "mf_id" ,
1459+ "ion_formula" ,
1460+ "ref_ion_type" ,
1461+ "formula" ,
1462+ "inchikey" ,
1463+ "name" ,
1464+ "inchi" ,
1465+ "chebi" ,
1466+ "smiles" ,
1467+ "kegg" ,
1468+ "cas" ,
1469+ "database_name" ,
1470+ "ref_ms_id" ,
1471+ "entropy_similarity" ,
1472+ "ref_mz_in_query_fract" ,
1473+ "n_spectra_contributing" ,
1474+ ]
1475+
14891476 # Reorder columns
14901477 metabolite_summary = metabolite_summary [
1491- [
1492- "mf_id" ,
1493- "ion_formula" ,
1494- "ref_ion_type" ,
1495- "formula" ,
1496- "inchikey" ,
1497- "name" ,
1498- "inchi" ,
1499- "chebi" ,
1500- "smiles" ,
1501- "kegg" ,
1502- "cas" ,
1503- "database_name" ,
1504- "ref_ms_id" ,
1505- "entropy_similarity" ,
1506- "ref_mz_in_query_fract" ,
1507- "n_spectra_contributing" ,
1508- ]
1478+ [col for col in col_order if col in metabolite_summary .columns ]
15091479 ]
15101480
1481+ # Convert chebi (if present) to int:
1482+ if "chebi" in metabolite_summary .columns :
1483+ metabolite_summary ["chebi" ] = metabolite_summary ["chebi" ].astype (
1484+ "Int64" , errors = "ignore"
1485+ )
1486+
15111487 # Set the index to mf_id
15121488 metabolite_summary = metabolite_summary .set_index ("mf_id" )
15131489
@@ -1630,12 +1606,14 @@ def to_report(self, molecular_metadata=None):
16301606 ms2_annot_report = self .mass_spectra .mass_features_ms2_annot_to_df (
16311607 molecular_metadata = molecular_metadata
16321608 )
1633- if ms2_annot_report is not None :
1609+ if ms2_annot_report is not None and molecular_metadata is not None :
16341610 ms2_annot_report = self .summarize_metabolomics_report (ms2_annot_report )
16351611 ms2_annot_report = self .clean_ms2_report (ms2_annot_report )
16361612 ms2_annot_report = ms2_annot_report .dropna (axis = 1 , how = "all" )
16371613 ms2_annot_report = ms2_annot_report .reset_index (drop = False )
1638-
1614+ else :
1615+ ms2_annot_report = None
1616+
16391617 report = self .combine_reports (
16401618 mf_report = mf_report ,
16411619 ms1_annot_report = ms1_annot_report ,
@@ -1982,7 +1960,7 @@ def to_report(self, molecular_metadata=None):
19821960 ms2_annot_report = self .mass_spectra .mass_features_ms2_annot_to_df (
19831961 molecular_metadata = molecular_metadata
19841962 )
1985- if ms2_annot_report is not None :
1963+ if ms2_annot_report is not None and molecular_metadata is not None :
19861964 ms2_annot_report = self .summarize_lipid_report (ms2_annot_report )
19871965 ms2_annot_report = self .clean_ms2_report (ms2_annot_report )
19881966 ms2_annot_report = ms2_annot_report .dropna (axis = 1 , how = "all" )
0 commit comments