ProteoWizard
SAXParser.hpp
Go to the documentation of this file.
1 //
2 // $Id: SAXParser.hpp 3703 2012-06-15 18:23:36Z pcbrefugee $
3 //
4 //
5 // Original author: Darren Kessner <darren@proteowizard.org>
6 //
7 // Copyright 2007 Spielberg Family Center for Applied Proteomics
8 // Cedars-Sinai Medical Center, Los Angeles, California 90048
9 //
10 // Reworked for zero-copy performance by Brian Pratt, Insilicos LLC
11 // those changes Copyright 2011 Insilicos LLC All Rights Reserved
12 //
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 //
17 // http://www.apache.org/licenses/LICENSE-2.0
18 //
19 // Unless required by applicable law or agreed to in writing, software
20 // distributed under the License is distributed on an "AS IS" BASIS,
21 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 // See the License for the specific language governing permissions and
23 // limitations under the License.
24 //
25 
26 
27 #ifndef _SAXPARSER_HPP_
28 #define _SAXPARSER_HPP_
29 
33 #include "boost/iostreams/positioning.hpp"
34 #include <string.h>
35 #include <iosfwd>
36 #include <string>
37 #include <vector>
38 #include <assert.h>
39 #include <stdexcept>
40 
41 
42 namespace pwiz {
43 namespace minimxml {
44 
45 
46 ///
47 /// An extended SAX interface for custom XML stream parsing.
48 ///
49 /// Use cases:
50 /// - read a single element
51 /// - read a single element, aborting on a specified tag
52 /// - delegate handling of a sub-element to another handler
53 ///
54 namespace SAXParser {
55 
56 PWIZ_API_DECL size_t count_trail_ws(const char *data,size_t len); // count whitespace chars at end of data
57 PWIZ_API_DECL void unescapeXML(char *str);
58 PWIZ_API_DECL void unescapeXML(std::string &str);
59 
60 class saxstring
61 {
62  // simple string management for zero-copy saxparser
63  //
64  // not using std::string due to overhead with:
65  // reference counts
66  // exception unwinding
67  // etc etc
68  //
69  // provides for zero-copy trimming of whitespace
70  //
71 public:
72  saxstring(size_t size = 0) {
73  init(size);
74  }
75 
77  *this = rhs;
78  }
79 
80  saxstring(const std::string &rhs) {
81  init(rhs.length());
82  memcpy(data(),rhs.c_str(),rhs.length());
83  (*this)[rhs.length()] = 0; // nullterm
84  }
85 
86  void unescapeXML() {
87  if (strchr(c_str(),'&')) {
89  resize(strlen(c_str()));
90  }
91  }
92 
94  free(_data);
95  }
96 
98  init(rhs.length());
99  if (length()) {
100  memcpy(data(),rhs.c_str(),length()+1);
101  }
102  return *this;
103  }
104 
105  saxstring & operator = (const char *rhs) {
106  init(rhs ? strlen(rhs) : 0);
107  if (length()) {
108  memcpy(data(),rhs,length()+1);
109  }
110  return *this;
111  }
112 
114  if (rhs.length()) {
115  size_t oldsize = length();
116  resize(rhs.length()+oldsize);
117  memcpy(data()+oldsize,rhs.c_str(),rhs.length()+1);
118  }
119  return *this;
120  }
121 
122  saxstring & operator += (const char *rhs) {
123  size_t rhslen = rhs?strlen(rhs):0;
124  if (rhslen) {
125  size_t oldsize = length();
126  resize(rhslen+oldsize);
127  strcpy(data()+oldsize,rhs);
128  }
129  return *this;
130  }
131 
132  bool operator == (const char *c) const {
133  return c && !strcmp(c,c_str());
134  }
135 
136  bool operator == (const std::string &s) const {
137  return !strcmp(c_str(),s.c_str());
138  }
139 
140  bool operator == (const saxstring &s) const {
141  return !strcmp(c_str(),s.c_str());
142  }
143 
144  char *resize(size_t size) {
145  if (!size) {
146  _lead = 0; // empty, reclaim the start of buffer
147  }
148  size_t new_used = size + _lead; // translate to "used" space
149  if (new_used >= _capacity) {
150  _data = (char *)realloc(_data, (_used = new_used)+1);
151  if (_used && !_data) {
152  throw std::runtime_error("SAXParser: cannot allocate memory");
153  }
154  _capacity = _used;
155  } else {
156  _used = new_used;
157  }
158  _data[_used] = 0;
159  return _data;
160  }
161  void clear() {
162  resize(0);
163  }
164  inline const char *c_str() const {
165  return _data?_data+_lead:"";
166  }
167  inline char & operator [](size_t n) {
168  return *(data()+n);
169  }
170  inline size_t length() const {
171  return _used-_lead;
172  }
173  inline size_t capacity() const {
174  return _capacity;
175  }
176  void trim_trail_ws() { // remove trailing whitespace if any
177  size_t n = count_trail_ws(c_str(),length());
178  resize(length()-n);
179  }
180  // returns number of ws chars it had to eat on front end
181  int trim_lead_ws() {
182  size_t n=0;
183  for (const char *c=c_str(); *c && strchr(" \n\r\t",*c); c++) {
184  n++;
185  }
186  _lead += n;
187  return n;
188  }
189  bool starts_with(const char *txt) const {
190  return !strncmp(c_str(),txt,strlen(txt));
191  }
192  bool ends_with(const char *txt) const {
193  size_t len = strlen(txt);
194  return (len <= length()) ? (!strcmp(c_str()+length()-len,txt)) : false;
195  }
196  char *data() { // direct access to data buffer
197  if (!_data) {
198  resize(0);
199  }
200  return _data+_lead;
201  }
202 private:
203  void init(size_t size) {
204  _used = 0;
205  _lead = 0;
206  _capacity = 0;
207  _data = NULL;
208  if (size) {
209  resize(size);
210  }
211  }
212  char * _data; // char buf
213  size_t _used; // characters used
214  size_t _lead; // for skipping whitespace
215  size_t _capacity; // max characters (always >_used)
216 };
217 
218 inline std::ostream& operator<<(std::ostream& os, const saxstring& s)
219 {
220  os << s.c_str();
221  return os;
222 }
223 
224 // fast string-to-value conversions
225 // not very boost-y, or even very c++, but lexical_cast and istringstreams are
226 // just too slow for our parsing performance needs.
227 template< typename Target > inline Target textToValue(const char *txt); // template prototype
228 
229 template<> inline float textToValue(const char *txt)
230 {
231  return (float) ATOF( txt ) ;
232 }
233 
234 template<> inline double textToValue(const char *txt)
235 {
236  return ATOF( txt );
237 }
238 
239 template<> inline int textToValue(const char *txt)
240 {
241  return atoi(txt);
242 }
243 
244 template<> inline char textToValue(const char *txt)
245 {
246  return *(txt);
247 }
248 
249 template<> inline long textToValue(const char *txt)
250 {
251  return atol(txt);
252 }
253 
254 template<> inline unsigned int textToValue(const char *txt)
255 {
256  return (unsigned int) strtoul( txt, NULL, 10 );
257 }
258 
259 template<> inline unsigned long textToValue(const char *txt)
260 {
261  return strtoul( txt, NULL, 10 );
262 }
263 
#if defined(BOOST_HAS_LONG_LONG)

/// long long: uses the MS-specific routine when BOOST_HAS_MS_INT64 is
/// defined, atoll otherwise.
template<> inline long long textToValue<long long>(const char *txt)
{
#if defined(BOOST_HAS_MS_INT64)
    return _atoi64( txt );
#else
    return atoll( txt );
#endif
}

/// unsigned long long: base-10, same platform split as above.
template<> inline unsigned long long textToValue<unsigned long long>(const char *txt)
{
#if defined(BOOST_HAS_MS_INT64)
    return _strtoui64( txt, NULL, 10 );
#else
    return strtoull( txt, NULL, 10 );
#endif
}

#endif // has long long
285 
/// Text is considered true unless it is exactly "0" or exactly "false"
/// (case-sensitive); as in optimized_lexical_cast.h.
inline bool istrue(const char *t)
{
    if (strcmp(t, "0") == 0) {
        return false;
    }
    return strcmp(t, "false") != 0;
}
290 
291 template<> inline bool textToValue(const char *txt)
292 {
293  return istrue(txt);
294 }
295 
296 template<> inline boost::logic::tribool textToValue(const char *txt)
297 {
298  using namespace boost::logic;
299  if (!*txt)
300  return tribool(indeterminate);
301  else
302  {
303  bool b = istrue(txt);
304  return tribool(b);
305  }
306 }
307 
308 template<> inline std::string textToValue(const char *txt)
309 {
310  return std::string( txt );
311 }
312 
313 
314 /// SAX event handler interface.
315 class Handler
316 {
317  public:
318 
319  /// When false, no calls to characters() will be made
321 
322  /// Setting these to false will disable the auto-unescaping feature of the parser;
323  /// this is useful for handlers which deal with large amounts of data
325 
326  /// contextual version available to control handler logic which support multiple versions of a schema;
327  /// the default value 0 indicates handler should ignore the version;
328  /// the handler determines the meaning of any non-zero value
329  int version;
330 
331  /// Handler returns the Status struct as a means of changing the parser's behavior.
332  struct Status
333  {
334  enum Flag
335  {
336  Ok, // ok, continue parsing the stream
337  Done, // abort immediately
338  Delegate // delegate this element to the specified Handler [startElement() only]
339  };
340 
342  Handler* delegate; // valid iff (flag == Delegate)
343 
344  Status(Flag _flag = Ok,
345  Handler* _delegate = 0)
346  : flag(_flag), delegate(_delegate)
347  {}
348  };
349 
351  class Attributes
352  {
353  // lazy evaluation - doesn't process text until asked
354  // near-zero copy - copies the source text just once,
355  // instead of a bunch of little std::string operations
356  public:
357  Attributes(const char * _source_text, size_t _source_text_len, bool _autoUnescape) :
358  index(0),index_end(0),autoUnescape(_autoUnescape),firstread(true),attrs()
359  {
360  size=_source_text_len;
361  textbuff = (char *)malloc(size+1);
362  managemem = true;
363  memcpy(textbuff,_source_text,size);
364  textbuff[size] = 0;
365  setParserIndex(); // ready for eventual parsing
366  test_invariant(); // everything correct?
367  };
369  index(0),index_end(0),autoUnescape(false),firstread(true),attrs()
370  {
371  size=0;
372  textbuff = NULL;
373  managemem = true;
374  test_invariant(); // everything correct?
375  };
376  Attributes(saxstring &str, bool _autoUnescape) :
377  index(0),index_end(0),autoUnescape(_autoUnescape),firstread(true),attrs()
378  {
379  textbuff = str.data();
380  size=str.length();
381  managemem = false; // we don't have to free this
382  setParserIndex(); // ready for eventual parsing
383  test_invariant(); // everything correct?
384  };
386  {
387  if (managemem)
388  free(textbuff);
389  }
390  Attributes(const Attributes &rhs)
391  {
392  textbuff = NULL;
393  *this = rhs;
394  }
396  size = rhs.size;
397  index = rhs.index;
398  index_end = rhs.index_end; // string bounds for attribute parsing
399  autoUnescape = rhs.autoUnescape; // do XML escape of attribute?
400  firstread = rhs.firstread; // may change during const access
401  if (managemem)
402  textbuff = (char *)realloc(textbuff,size+1);
403  else
404  textbuff = (char *)malloc(size+1);
405  managemem = true; // we need to free textbuff at dtor
406  memcpy(textbuff,rhs.textbuff,size+1);
407  attrs.resize(rhs.attrs.size());
408  // now fix up the char ptrs to point to our copy of attribute list
409  for (size_t n=attrs.size();n--;)
410  {
411  attrs[n].name = ((char *)textbuff)+(rhs.attrs[n].getName()-rhs.getTextBuffer());
412  attrs[n].value = ((char *)textbuff)+(rhs.attrs[n].getValuePtr()-rhs.getTextBuffer());
413  }
414  test_invariant(); // everything correct?
415  return *this;
416  }
417 
418  inline void test_invariant() const
419  {
420 #ifdef _DEBUG
421  for (size_t n=attrs.size();n--;)
422  {
423  assert(textbuff != NULL);
424  assert(attrs[n].name>textbuff);
425  assert(attrs[n].value>attrs[n].name);
426  assert(attrs[n].value<textbuff+size);
427  if (n)
428  assert(attrs[n].name>attrs[n-1].value);
429  }
430 #endif
431  }
432 
433  const char *getTagName() const
434  { // work area contains tag name
435  test_invariant(); // everything correct?
436  return textbuff+('/'==*textbuff);
437  }
438  const char *getTextBuffer() const
439  { // return pointer to our work area
440  test_invariant(); // everything correct?
441  return textbuff;
442  }
443  size_t getSize() const
444  {
445  return size;
446  }
447  protected:
448  mutable char *textbuff; // we'll operate on this copy of string
449  size_t size;
450  mutable size_t index,index_end; // string bounds for attribute parsing
451  bool autoUnescape; // do XML escape of attribute?
452  bool managemem; // if true we need to free on exit
453  mutable bool firstread; // may change during const access
454 
456  {
457  // on entry, buffer has form "foo bar="baz" or maybe "foo/"
458  const char *c = textbuff;
459  while (*c && !strchr(" \n\r\t/",*c)) c++;
460  size_t indexNameEnd = c-textbuff;
461  while (*c && strchr(" \n\r\t",*c)) c++;
462  textbuff[indexNameEnd] = 0; // nullterm the name
463  index = c-textbuff; // should point to bar
464  index_end = size;
465  test_invariant(); // everything correct?
466  }
467  public:
468  class attribute
469  {
470  // a set of pointers into the main text buffer - going for zero copy, for speed
471  public:
472  attribute() {};
473  bool matchName(const char *test) const
474  {
475  return !strcmp(test,name); // return true on match
476  }
477  const char *getName() const
478  {
479  return name;
480  }
481 
482  // handle XML escapes on demand
483  const char *getValuePtr(XMLUnescapeBehavior_t Unescape = XMLUnescapeDefault) const
484  {
485  if (Unescape == NoXMLUnescape)
486  needsUnescape = false;
487  else if (needsUnescape) {
489  needsUnescape = false;
490  }
491  return value;
492  }
493  std::string getValue(XMLUnescapeBehavior_t Unescape = XMLUnescapeDefault) const {
494  return std::string(getValuePtr(Unescape));
495  }
496 
497  // cast-to-type
498  template< typename T >
499  inline T valueAs( XMLUnescapeBehavior_t Unescape ) const
500  {
501  return textToValue<T>(getValuePtr(Unescape));
502  }
503 
504  inline size_t valueAs( XMLUnescapeBehavior_t Unescape ) const
505  {
506  return (size_t)strtoul(getValuePtr(Unescape),NULL,10);
507  }
508 
509  friend class Attributes;
510  protected:
511  const char *name; // attribute name - a pointer into main text buffer
512  char *value; // also a pointer into main text buffer, content may change during read
513  mutable bool needsUnescape; // may change during read
514  void set(const char *_name, char *_value, bool _needsUnescape)
515  {
516  name = _name;
517  value = _value;
518  needsUnescape = _needsUnescape;
519  }
520  }; // class attribute
521 
522  public:
523  typedef std::vector<attribute> attribute_list;
524  protected:
525  mutable attribute_list attrs; // may change even in a const function due to lazy evaluation
526  public:
527  attribute_list::const_iterator begin() const
528  {
529  access(); // have we actually parsed the attributes text yet?
530  return attrs.begin();
531  }
532  attribute_list::const_iterator end() const
533  {
534  access(); // have we actually parsed the attributes text yet?
535  return attrs.end();
536  }
537  attribute_list::const_iterator find(const std::string &name) const
538  {
539  attribute_list::const_iterator it;
540  for (it = begin(); it != end() ; it++ )
541  {
542  if (it->matchName(name.c_str()))
543  break; // found it
544  }
545  return it;
546  }
547  protected:
548 
549  PWIZ_API_DECL void parseAttributes(std::string::size_type& index) const;
550 
551  void access() const
552  { // don't parse attributes until asked to
553  test_invariant(); // everything correct?
554  if (firstread) {
555  firstread = false;
557  }
558  test_invariant(); // everything correct?
559  }
560 
561  public:
562  const attribute *findAttributeByName(const char *name) const
563  {
564  access(); // parse the buffer if we haven't already
565  for (attribute_list::const_iterator it=attrs.begin();it!=attrs.end();it++)
566  {
567  if (it->matchName(name))
568  return &(*it);
569  }
570  return NULL;
571  }
572 
573  // return value for name if any, or NULL
574  const char *findValueByName(const char *name,XMLUnescapeBehavior_t Unescape = XMLUnescapeDefault) const
575  {
576  const attribute *attr = findAttributeByName(name);
577  if (attr)
578  return attr->getValuePtr(Unescape);
579  return NULL;
580  }
581 
582  };
583  typedef boost::iostreams::stream_offset stream_offset;
584 
585  virtual Status processingInstruction(const std::string& name,
586  const std::string& data,
588 
589  virtual Status startElement(const std::string& name,
590  const Attributes& attributes,
592 
593  virtual Status endElement(const std::string& name,
595 
598 
600  virtual ~Handler(){}
601 
602  protected:
603 
604  template <typename T>
605  inline T& getAttribute(const Attributes& attributes,
606  const char * name,
607  T& result,
608  XMLUnescapeBehavior_t Unescape,
609  T defaultValue = T()) const
610  {
611  const Attributes::attribute *attr = attributes.findAttributeByName(name);
612  if (attr)
613  result = attr->valueAs<T>(Unescape);
614  else
615  result = defaultValue;
616  return result;
617  }
618 
619  const char *getAttribute(const Attributes& attributes,
620  const char * name,
621  XMLUnescapeBehavior_t Unescape,
622  const char * defaultValue = NULL) const
623  {
624  const char *val = attributes.findValueByName(name,Unescape);
625  if (!val)
626  val = defaultValue;
627  return val;
628  }
629 
630 
631  // general case using default unescape behavior
632  template <typename T>
633  inline T& getAttribute(const Attributes& attributes,
634  const char *name,
635  T& result) const
636  {
637  const Attributes::attribute *attr = attributes.findAttributeByName(name);
638  if (attr)
639  result = attr->valueAs<T>(XMLUnescapeDefault);
640  else
641  result = T();
642  return result;
643  }
644 
645  inline std::string& getAttribute(const Attributes& attributes,
646  const char *name,
647  std::string& result) const
648  {
649  const Attributes::attribute *attr = attributes.findAttributeByName(name);
650  if (attr)
651  result = attr->getValuePtr(XMLUnescapeDefault);
652  else
653  result = "";
654  return result;
655  }
656 
657  // general case using default unescape behavior
658  template <typename T>
659  inline T& getAttribute(const Attributes& attributes,
660  const std::string &name,
661  T& result,
662  T defaultValue = T()) const
663  {
664  const Attributes::attribute *attr = attributes.findAttributeByName(name.c_str());
665  if (attr)
666  result = attr->valueAs<T>(XMLUnescapeDefault);
667  else
668  result = defaultValue;
669  return result;
670  }
671 };
672 
673 
674 ///
675 /// Extract a single XML element from the istream, sending SAX events to the handler.
676 ///
677 /// Behavior:
678 ///
679 /// - Parser returns when it completes reading of the first element it encounters.
680 ///
681 /// - Parser returns immediately if the Handler returns Status::Done when handling an event.
682 ///
683 /// - On startElement(), Handler may delegate handling to a sub-Handler, which will receive
684 /// the same startElement() event. The sub-Handler pointer will remain on the parser's
685 /// Handler stack until it handles the corresponding endElement(). Caution: The sub-Handler
686 /// pointer must remain valid while it is on the Handler stack, so it cannot point to
687 /// a local object that goes out of scope when Handler:startElement() returns.
688 ///
689 /// Notes:
690 /// - Start tags with end marker '/' generate two events, e.g. <br/> will generate events
691 /// startElement("br", ...) and endElement("br").
692 ///
693 PWIZ_API_DECL void parse(std::istream& is, Handler& handler);
694 
695 
696 } // namespace SAXParser
697 
698 
699 /// Returns the root element from an XML buffer;
700 /// throws runtime_error if no element is found.
701 PWIZ_API_DECL std::string xml_root_element(const std::string& fileheader);
702 
703 /// Returns the root element from an XML stream;
704 /// throws runtime_error if no element is found.
705 PWIZ_API_DECL std::string xml_root_element(std::istream& is);
706 
707 /// Returns the root element from an XML file;
708 /// throws runtime_error if no element is found.
709 PWIZ_API_DECL std::string xml_root_element_from_file(const std::string& filepath);
710 
711 
712 /// Decodes any characters encoded with their hexadecimal value,
713 /// e.g. "_x0020_" decodes as " "
714 /// This variant modifies the input string in place and returns a reference to it.
715 PWIZ_API_DECL std::string& decode_xml_id(std::string& str);
716 
717 
718 /// Decodes any characters encoded with their hexadecimal value,
719 /// e.g. "_x0020_" decodes as " "
720 /// This variant leaves the input string untouched and returns a decoded copy of it.
721 PWIZ_API_DECL std::string decode_xml_id_copy(const std::string& str);
722 
723 
724 } // namespace minimxml
725 } // namespace pwiz
726 
727 
728 #endif // _SAXPARSER_HPP_
729 
730