ProteoWizard
Serializer_pepXML_Test.cpp
Go to the documentation of this file.
1 //
2 // $Id: Serializer_pepXML_Test.cpp 4129 2012-11-20 00:05:37Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2010 Vanderbilt University - Nashville, TN 37232
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
21 
22 
23 #include "Serializer_pepXML.hpp"
24 #include "Diff.hpp"
25 #include "References.hpp"
26 #include "examples.hpp"
31 #include "TextWriter.hpp"
32 #include "boost/range/adaptor/transformed.hpp"
33 #include "boost/range/algorithm/max_element.hpp"
34 #include "boost/range/algorithm/min_element.hpp"
35 #include <cstring>
36 
37 
38 using namespace pwiz::identdata;
39 using namespace pwiz::identdata::examples;
40 using namespace pwiz::util;
41 namespace proteome = pwiz::proteome;
42 
// Optional stream for verbose test output. It stays null unless main() is
// started with "-v" as its first argument, in which case it points at cout.
43 ostream* os_ = 0;
44 
// Unary functor that maps an EnzymePtr to its terminalSpecificity, returned as int.
// It is instantiated below as EnzymePtr_specificity() and piped through
// boost::adaptors::transformed; the result_type typedef is presumably there for
// that adaptor's benefit (TODO confirm).
// NOTE(review): the declaration line naming this struct is absent from this listing.
46 {
47  typedef int result_type;
48  int operator()(const EnzymePtr& x) const {return x->terminalSpecificity;}
49 };
50 
// Unary functor that maps an EnzymePtr to its missedCleavages count, returned as int.
// It is instantiated below as EnzymePtr_missedCleavages() and piped through
// boost::adaptors::transformed; the result_type typedef is presumably there for
// that adaptor's benefit (TODO confirm).
// NOTE(review): the declaration line naming this struct is absent from this listing.
52 {
53  typedef int result_type;
54  int operator()(const EnzymePtr& x) const {return x->missedCleavages;}
55 };
56 
// Body of stripUnmappedMetadata (called on an IdentData named mzid further down).
// It edits mzid in place, clearing or normalizing the fields that, according to the
// comments below, pepXML does not carry, so that a later comparison against a
// round-tripped copy is not tripped up by them.
// NOTE(review): several listing lines are absent here -- the declaration line, the
// line that introduces `sip` (listing line 72), and the statement(s) after the final
// "protein assembly" comment (listing lines 140-141).
58 {
// drop top-level metadata collections and reset the provider to a default-constructed one
59  mzid.bibliographicReference.clear();
60  mzid.analysisSampleCollection.samples.clear();
61  mzid.auditCollection.clear();
62  mzid.provider = Provider();
63  mzid.dataCollection.inputs.sourceFile.clear();
64 
// per analysis software entry: clear URI and customizations, drop the contact role
65  BOOST_FOREACH(AnalysisSoftwarePtr& as, mzid.analysisSoftwareList)
66  {
67  as->URI.clear();
68  as->customizations.clear();
69  as->contactRolePtr.reset();
70  }
71 
// NOTE(review): `sip` is used below but its declaration is not visible in this listing.
73 
74  // pepXML only provides a single min_number_termini and max_num_internal_cleavages for all enzymes
// NOTE(review): both iterators are dereferenced unconditionally, so this assumes
// sip.enzymes.enzymes is non-empty -- confirm against the example data.
75  int minSpecificity = *boost::range::min_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_specificity()));
76  int maxMissedCleavages = *boost::range::max_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_missedCleavages()));
// every enzyme gets the same (lowest) specificity and (highest) missed-cleavage count
77  BOOST_FOREACH(const EnzymePtr& ez, sip.enzymes.enzymes)
78  {
79  ez->terminalSpecificity = (proteome::Digestion::Specificity) minSpecificity;
80  ez->missedCleavages = maxMissedCleavages;
81  }
82 
83  // pepXML doesn't map these elements
84  sip.massTable.clear();
85  sip.threshold.clear();
86  sip.databaseFilters.clear();
87  sip.databaseTranslation.reset();
88 
89  // pepXML doesn't map these attributes
90  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->name.clear();
91  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->version.clear();
92  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->releaseDate.clear();
93  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->databaseName.clear();
94 
95  // pepXML doesn't reliably store location or file format
// the input spectra location is reduced to its filename with the extension removed
96  string& location = mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->location;
97  location = BFS_STRING(bfs::path(location).replace_extension("").filename());
98  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->fileFormat = CVParam();
99 
// same filename-without-extension reduction for the search database location
100  string& location2 = mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->location;
101  location2 = BFS_STRING(bfs::path(location2).replace_extension("").filename());
102 
103  // pepXML doesn't support protein sequences
104  BOOST_FOREACH(DBSequencePtr& dbSequence, mzid.sequenceCollection.dbSequences)
105  {
106  dbSequence->seq.clear();
107  dbSequence->length = 0;
// the id is rebuilt from the accession with a fixed "DBSeq_" prefix
108  dbSequence->id = "DBSeq_" + dbSequence->accession;
109  }
110 
111  // pepXML can only support one mass type (we pick the max mass in case one of them is 0)
// NOTE(review): max() favours the non-zero value only when both deltas are >= 0;
// with one negative delta and one 0 it would select 0 -- confirm this is intended.
112  BOOST_FOREACH(PeptidePtr& peptide, mzid.sequenceCollection.peptides)
113  BOOST_FOREACH(ModificationPtr& mod, peptide->modification)
114  mod->monoisotopicMassDelta = mod->avgMassDelta = max(mod->monoisotopicMassDelta, mod->avgMassDelta);
115 
116  // pepXML doesn't support fragment metadata
117  mzid.dataCollection.analysisData.spectrumIdentificationList[0]->fragmentationTable.clear();
118 
// walk every identification item of every result in the first identification list
119  BOOST_FOREACH(SpectrumIdentificationResultPtr& sir, mzid.dataCollection.analysisData.spectrumIdentificationList[0]->spectrumIdentificationResult)
120  BOOST_FOREACH(SpectrumIdentificationItemPtr& sii, sir->spectrumIdentificationItem)
121  {
122  // pepXML doesn't support fragment metadata or mass tables
123  sii->fragmentation.clear();
124  sii->massTablePtr.reset();
125 
126  for (size_t i=0; i < sii->peptideEvidencePtr.size(); ++i)
127  {
128  PeptideEvidence& pe = *sii->peptideEvidencePtr[i];
129 
130  // pepXML does not store peptide start and end offsets
131  pe.start = pe.end = 0;
132 
133  // pepXML's alternative_proteins do not store prev/next AA or missed cleavages
// only the first evidence (i == 0) keeps its pre/post residues; the rest become '?'
134  if (i > 0)
135  pe.pre = pe.post = '?';
136  }
137  }
138 
139  // pepXML doesn't have protein assembly
142 }
143 
// Checks the serialized pepXML text for expected attribute spellings.
// @param str  the full serialized document as a string
// Each check is a plain substring search (bal::contains) wrapped in unit_assert,
// so it verifies presence of the attribute text, not where it appears.
144 void testTranslation(const string& str)
145 {
146  // test that search engine name is written using preferred name
147  unit_assert(bal::contains(str, "search_engine=\"Mascot\""));
148 
149  // test that score names are written using preferred name
150  unit_assert(bal::contains(str, "name=\"ionscore\""));
151  unit_assert(bal::contains(str, "name=\"homologyscore\""));
152  unit_assert(bal::contains(str, "name=\"identityscore\""));
153  unit_assert(bal::contains(str, "name=\"expect\""));
154  unit_assert(bal::contains(str, "name=\"an extra score\""));
155 
156  // test that nativeID is preserved
157  unit_assert(bal::contains(str, "spectrumNativeID=\"controllerType=0 controllerNumber=1 scan=420\""));
158 }
159 
// Round-trip check: write `mzid` as pepXML into an in-memory stream using a
// serializer built from `config`, read that text back into a second IdentData,
// and assert that Diff finds no difference between the two.
// NOTE(review): the declaration line is absent from this listing; `mzid` and
// `config` are presumably its parameters -- confirm against the original file.
161 {
162  if (os_) *os_ << "begin testSerialize" << endl;
163 
164  Serializer_pepXML serializer(config);
165  ostringstream oss;
166  serializer.write(oss, mzid, "tiny.pepXML");
167 
// dump the serialized text when verbose; the text-level checks only run when
// the config has readSpectrumQueries set
168  if (os_) *os_ << "oss:\n" << oss.str() << endl;
169  if (config.readSpectrumQueries)
170  testTranslation(oss.str());
171 
// read() is handed a shared_ptr to the stream, so the text is copied into a new istringstream
172  shared_ptr<istringstream> iss(new istringstream(oss.str()));
173  IdentData mzid2;
174  serializer.read(iss, mzid2);
175 
// references are resolved on the freshly read copy before comparing
176  References::resolve(mzid2);
177 
178  Diff<IdentData, DiffConfig> diff(mzid, mzid2);
// print the differences (verbose only) before the assertion fails on them
179  if (os_ && diff) *os_ << diff << endl;
180  unit_assert(!diff);
181 }
182 
// Scenario driver (invoked from main as testSerialize()). It builds one IdentData,
// strips what pepXML cannot represent, then reconfigures the first
// SpectrumIdentificationProtocol's enzyme list for each case described by the
// section comments below.
// NOTE(review): the declaration line and the statements that populate `mzid` and
// run each scenario are absent from this listing (e.g. listing lines 186, 188, 203,
// 218, 226, 239-242), so only the setup for each case is visible here.
184 {
185  IdentData mzid;
187  stripUnmappedMetadata(mzid);
189 
190 
191  // test non-specific enzyme
// replace the enzyme list with a single NonSpecific entry
192  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
193  EnzymePtr noEnzyme(new Enzyme);
194  noEnzyme->id = "ENZ_1";
195  noEnzyme->cTermGain = "OH";
196  noEnzyme->nTermGain = "H";
197  noEnzyme->missedCleavages = 2;
198  noEnzyme->minDistance = 1;
199  noEnzyme->terminalSpecificity = proteome::Digestion::NonSpecific;
200  noEnzyme->siteRegexp = "(?<=[KR])";
201  noEnzyme->enzymeName.set(MS_Trypsin_P);
202  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(noEnzyme);
204 
205 
206  // test sense="N" enzymes
// replace the enzyme list with a single FullySpecific Asp-N entry (lookahead-only regex)
207  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
208  EnzymePtr aspN(new Enzyme);
209  aspN->id = "ENZ_1";
210  aspN->cTermGain = "OH";
211  aspN->nTermGain = "H";
212  aspN->missedCleavages = 2;
213  aspN->minDistance = 1;
214  aspN->terminalSpecificity = proteome::Digestion::FullySpecific;
215  aspN->siteRegexp = "(?=[BD])";
216  aspN->enzymeName.set(MS_Asp_N);
217  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(aspN);
219 
// aspN is a shared pointer already stored in the list, so these edits change the
// listed enzyme in place: SemiSpecific, a wider regex, and a user-param name "custom"
220  aspN->missedCleavages = 4;
221  aspN->minDistance = 2;
222  aspN->terminalSpecificity = proteome::Digestion::SemiSpecific;
223  aspN->siteRegexp = "(?=[BND])";
224  aspN->enzymeName.clear();
225  aspN->enzymeName.userParams.push_back(UserParam("custom"));
227 
228 
229  // test with readSpectrumQueries == false
230 
231  // clear the original SequenceCollection
232  mzid.sequenceCollection.dbSequences.clear();
233  mzid.sequenceCollection.peptides.clear();
234  mzid.sequenceCollection.peptideEvidence.clear();
235 
236  // clear the original analysis data
237  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->spectrumIDFormat = CVParam();
238  mzid.analysisCollection.spectrumIdentification[0]->spectrumIdentificationListPtr.reset();
241 
243 }
244 
// Exercises pepXMLSpecificity(Enzyme): for a series of enzyme descriptions it
// asserts the returned sense ("C" or "N"), cut residues and, in some cases,
// no_cut residues.
// NOTE(review): the declaration line and a number of statements are absent from
// this listing (e.g. listing lines 250, 257, 261, 268, 275, 282, 285, 291, 295-297,
// 301, 307, 338, 344, 350, 356). Where an assertion group below follows a gap, the
// enzyme setup it depends on is not visible here.
246 {
247  PepXMLSpecificity result;
248  Enzyme ez;
249 
// (setup for this case is on a missing line)
251  result = pepXMLSpecificity(ez);
252  unit_assert_operator_equal("C", result.sense);
253  unit_assert_operator_equal("KR", result.cut);
254  unit_assert_operator_equal("P", result.no_cut);
255 
256  ez.enzymeName.clear();
// (setup for this case is on a missing line)
258  result = pepXMLSpecificity(ez);
259  unit_assert_operator_equal("C", result.sense);
260  unit_assert_operator_equal("KR", result.cut);
262 
// enzyme identified only through a UserParam named "trypsin/p"
263  ez.enzymeName.clear();
264  ez.enzymeName.userParams.push_back(UserParam("trypsin/p"));
265  result = pepXMLSpecificity(ez);
266  unit_assert_operator_equal("C", result.sense);
267  unit_assert_operator_equal("KR", result.cut);
269 
// enzyme identified only through its name attribute
270  ez.enzymeName.clear();
271  ez.name = "trypsin/p";
272  result = pepXMLSpecificity(ez);
273  unit_assert_operator_equal("C", result.sense);
274  unit_assert_operator_equal("KR", result.cut);
276 
// enzyme identified through the MS_Asp_N CV term: expects sense "N", cut "BD"
277  ez.name.clear();
278  ez.enzymeName.set(MS_Asp_N);
279  result = pepXMLSpecificity(ez);
280  unit_assert_operator_equal("N", result.sense);
281  unit_assert_operator_equal("BD", result.cut);
283 
284  ez.enzymeName.clear();
// (setup for the next three cases is on missing lines)
286  result = pepXMLSpecificity(ez);
287  unit_assert_operator_equal("C", result.sense);
288  unit_assert_operator_equal("KR", result.cut);
289  unit_assert_operator_equal("P", result.no_cut);
290 
292  result = pepXMLSpecificity(ez);
293  unit_assert_operator_equal("C", result.sense);
294  unit_assert_operator_equal("KR", result.cut);
296 
298  result = pepXMLSpecificity(ez);
299  unit_assert_operator_equal("N", result.sense);
300  unit_assert_operator_equal("BD", result.cut);
302 
303 
304  // REMEMBER: update the pepXMLSpecificity function when new CV enzymes are added
// Any exception from pepXMLSpecificity is printed to cerr and recorded in the flag
// rather than propagated; the flag is asserted once afterwards.
// NOTE(review): `cleavageAgent` is not declared in the visible lines; the try block
// presumably sits inside a loop whose header is on missing listing line 307.
305  bool allCleavageAgentsHandled = true;
306  ez.siteRegexp.clear();
308  try
309  {
310  ez.enzymeName.clear();
311  ez.enzymeName.set(cleavageAgent);
312  result = pepXMLSpecificity(ez);
313  }
314  catch (exception& e)
315  {
316  cerr << e.what() << endl;
317  allCleavageAgentsHandled = false;
318  }
319  unit_assert(allCleavageAgentsHandled);
320 
321 
// From here on the enzyme is described by siteRegexp. The expectations below show:
// a lookbehind class gives sense "C", a lookahead-only class gives sense "N", and a
// negative lookaround yields the residue letters not listed in the class.
322  ez.siteRegexp = "(?<=[QWERTY])(?=[QWERTY])";
323  result = pepXMLSpecificity(ez);
324  unit_assert_operator_equal("C", result.sense);
325  unit_assert_operator_equal("QWERTY", result.cut);
326  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.no_cut);
327 
328  ez.siteRegexp = "(?<![QWERTY])(?![QWERTY])";
329  result = pepXMLSpecificity(ez);
330  unit_assert_operator_equal("C", result.sense);
331  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
332  unit_assert_operator_equal("QWERTY", result.no_cut);
333 
334  ez.siteRegexp = "(?<=[QWERTY])";
335  result = pepXMLSpecificity(ez);
336  unit_assert_operator_equal("C", result.sense);
337  unit_assert_operator_equal("QWERTY", result.cut);
339 
340  ez.siteRegexp = "(?=[QWERTY])";
341  result = pepXMLSpecificity(ez);
342  unit_assert_operator_equal("N", result.sense);
343  unit_assert_operator_equal("QWERTY", result.cut);
345 
346  ez.siteRegexp = "(?<![QWERTY])";
347  result = pepXMLSpecificity(ez);
348  unit_assert_operator_equal("C", result.sense);
349  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
351 
352  ez.siteRegexp = "(?![QWERTY])";
353  result = pepXMLSpecificity(ez);
354  unit_assert_operator_equal("N", result.sense);
355  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
357 }
358 
359 
// Exercises stripChargeFromConventionalSpectrumId on string ids. The expectations
// below show: a trailing ".<number>" after two equal numeric fields is removed
// (also when the basename itself contains dots or a colon), while ids that end
// directly in the two equal fields, or that have only one numeric field, come
// back unchanged.
// NOTE(review): the declaration line and listing line 370 are absent from this listing.
361 {
362  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123.2"));
363  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123.12"));
// the trailing field is stripped even when it equals the preceding fields
364  unit_assert_operator_equal("basename.2.2", stripChargeFromConventionalSpectrumId("basename.2.2.2"));
365  unit_assert_operator_equal("basename.ext.3.3", stripChargeFromConventionalSpectrumId("basename.ext.3.3.3"));
// ids with no trailing field after the equal pair are returned as-is
366  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123"));
367  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123"));
368  unit_assert_operator_equal("locus:1.1.1.123", stripChargeFromConventionalSpectrumId("locus:1.1.1.123.2"));
369  unit_assert_operator_equal("basename.123", stripChargeFromConventionalSpectrumId("basename.123"));
371 }
372 
373 
// Test entry point.
// @param argc/argv  passing "-v" as the first argument points os_ at cout, which
//                   turns on the verbose output used by the tests above
// Any std::exception or unknown exception escaping the tests is reported through
// the TEST_FAILED macro.
// NOTE(review): listing lines 381-382 and 394 are absent here, so further test
// calls and the statement that ends main are not visible.
374 int main(int argc, char** argv)
375 {
376  TEST_PROLOG(argc, argv)
377 
378  try
379  {
// strcmp returns 0 on equality, hence the negation
380  if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
383  testSerialize();
384  }
385  catch (exception& e)
386  {
387  TEST_FAILED(e.what())
388  }
389  catch (...)
390  {
391  TEST_FAILED("Caught unknown exception.")
392  }
393 
395 }