Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_SAX_PARSER_HPP
9 #define ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 namespace orcus {
14 
/**
 * Default configuration type for sax_parser; it is the default argument
 * for the parser's _Config template parameter.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version encoded as an integer (10 here).
     * sax_parser::header() compares this value against 11 to decide whether
     * a leading XML declaration is mandatory.
     */
    static const uint8_t baseline_version = 10;
};
24 
29 template<typename _Handler, typename _Config = sax_parser_default_config>
31 {
32 public:
33  typedef _Handler handler_type;
34  typedef _Config config_type;
35 
36  sax_parser(const char* content, const size_t size, handler_type& handler);
37  sax_parser(const char* content, const size_t size, bool transient_stream, handler_type& handler);
38  ~sax_parser();
39 
40  void parse();
41 
42 private:
43 
48  void header();
49  void body();
50  void element();
51  void element_open(std::ptrdiff_t begin_pos);
52  void element_close(std::ptrdiff_t begin_pos);
53  void special_tag();
54  void declaration(const char* name_check);
55  void cdata();
56  void doctype();
57  void characters();
58  void attribute();
59 
60 private:
61  handler_type& m_handler;
62 };
63 
64 template<typename _Handler, typename _Config>
66  const char* content, const size_t size, handler_type& handler) :
67  sax::parser_base(content, size, false),
68  m_handler(handler)
69 {
70 }
71 
72 template<typename _Handler, typename _Config>
73 sax_parser<_Handler,_Config>::sax_parser(
74  const char* content, const size_t size, bool transient_stream, handler_type& handler) :
75  sax::parser_base(content, size, transient_stream),
76  m_handler(handler)
77 {
78 }
79 
80 template<typename _Handler, typename _Config>
81 sax_parser<_Handler,_Config>::~sax_parser()
82 {
83 }
84 
85 template<typename _Handler, typename _Config>
86 void sax_parser<_Handler,_Config>::parse()
87 {
88  m_nest_level = 0;
89  mp_char = mp_begin;
90  header();
91  skip_space_and_control();
92  body();
93 
94  assert(m_buffer_pos == 0);
95 }
96 
97 template<typename _Handler, typename _Config>
98 void sax_parser<_Handler,_Config>::header()
99 {
100  // we don't handle multi byte encodings so we can just skip bom entry if exists.
101  skip_bom();
102  skip_space_and_control();
103  if (!has_char() || cur_char() != '<')
104  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
105 
106  if (config_type::baseline_version >= 11)
107  {
108  // XML version 1.1 requires a header declaration whereas in 1.0 it's
109  // optional.
110  if (next_char_checked() != '?')
111  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
112 
113  declaration("xml");
114  }
115 }
116 
117 template<typename _Handler, typename _Config>
118 void sax_parser<_Handler,_Config>::body()
119 {
120  while (has_char())
121  {
122  if (cur_char() == '<')
123  {
124  element();
125  if (!m_root_elem_open)
126  // Root element closed. Stop parsing.
127  return;
128  }
129  else if (m_nest_level)
130  // Call characters only when in xml hierarchy.
131  characters();
132  else
133  next();
134  }
135 }
136 
137 template<typename _Handler, typename _Config>
138 void sax_parser<_Handler,_Config>::element()
139 {
140  assert(cur_char() == '<');
141  std::ptrdiff_t pos = offset();
142  char c = next_char_checked();
143  switch (c)
144  {
145  case '/':
146  element_close(pos);
147  break;
148  case '!':
149  special_tag();
150  break;
151  case '?':
152  declaration(nullptr);
153  break;
154  default:
155  if (!is_alpha(c) && c != '_')
156  throw sax::malformed_xml_error("expected an alphabet.", offset());
157  element_open(pos);
158  }
159 }
160 
161 template<typename _Handler, typename _Config>
162 void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
163 {
164  assert(is_alpha(cur_char()) || cur_char() == '_');
165 
166  sax::parser_element elem;
167  element_name(elem, begin_pos);
168 
169  while (true)
170  {
171  skip_space_and_control();
172  char c = cur_char();
173  if (c == '/')
174  {
175  // Self-closing element: <element/>
176  if (next_and_char() != '>')
177  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
178  next();
179  elem.end_pos = offset();
180  m_handler.start_element(elem);
181  reset_buffer_pos();
182  m_handler.end_element(elem);
183  if (!m_nest_level)
184  m_root_elem_open = false;
185 #if ORCUS_DEBUG_SAX_PARSER
186  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
187 #endif
188  return;
189  }
190  else if (c == '>')
191  {
192  // End of opening element: <element>
193  next();
194  elem.end_pos = offset();
195  nest_up();
196  m_handler.start_element(elem);
197  reset_buffer_pos();
198 #if ORCUS_DEBUG_SAX_PARSER
199  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
200 #endif
201  return;
202  }
203  else
204  attribute();
205  }
206 }
207 
208 template<typename _Handler, typename _Config>
209 void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
210 {
211  assert(cur_char() == '/');
212  nest_down();
213  next_check();
214  sax::parser_element elem;
215  element_name(elem, begin_pos);
216 
217  if (cur_char() != '>')
218  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
219  next();
220  elem.end_pos = offset();
221 
222  m_handler.end_element(elem);
223 #if ORCUS_DEBUG_SAX_PARSER
224  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
225 #endif
226  if (!m_nest_level)
227  m_root_elem_open = false;
228 }
229 
230 template<typename _Handler, typename _Config>
231 void sax_parser<_Handler,_Config>::special_tag()
232 {
233  assert(cur_char() == '!');
234  // This can be either <![CDATA, <!--, or <!DOCTYPE.
235  size_t len = remains();
236  if (len < 2)
237  throw sax::malformed_xml_error("special tag too short.", offset());
238 
239  switch (next_and_char())
240  {
241  case '-':
242  {
243  // Possibly comment.
244  if (next_and_char() != '-')
245  throw sax::malformed_xml_error("comment expected.", offset());
246 
247  len -= 2;
248  if (len < 3)
249  throw sax::malformed_xml_error("malformed comment.", offset());
250 
251  next();
252  comment();
253  }
254  break;
255  case '[':
256  {
257  // Possibly a CDATA.
258  expects_next("CDATA[", 6);
259  if (has_char())
260  cdata();
261  }
262  break;
263  case 'D':
264  {
265  // check if this is a DOCTYPE.
266  expects_next("OCTYPE", 6);
267  skip_space_and_control();
268  if (has_char())
269  doctype();
270  }
271  break;
272  default:
273  throw sax::malformed_xml_error("failed to parse special tag.", offset());
274  }
275 }
276 
277 template<typename _Handler, typename _Config>
278 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
279 {
280  assert(cur_char() == '?');
281  next_check();
282 
283  // Get the declaration name first.
284  pstring decl_name;
285  name(decl_name);
286 #if ORCUS_DEBUG_SAX_PARSER
287  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
288 #endif
289 
290  if (name_check && decl_name != name_check)
291  {
292  std::ostringstream os;
293  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
294  throw sax::malformed_xml_error(os.str(), offset());
295  }
296 
297  m_handler.start_declaration(decl_name);
298  skip_space_and_control();
299 
300  // Parse the attributes.
301  while (cur_char_checked() != '?')
302  {
303  attribute();
304  skip_space_and_control();
305  }
306  if (next_char_checked() != '>')
307  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
308 
309  m_handler.end_declaration(decl_name);
310  reset_buffer_pos();
311  next();
312 #if ORCUS_DEBUG_SAX_PARSER
313  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
314 #endif
315 }
316 
317 template<typename _Handler, typename _Config>
318 void sax_parser<_Handler,_Config>::cdata()
319 {
320  size_t len = remains();
321  assert(len > 3);
322 
323  // Parse until we reach ']]>'.
324  const char* p0 = mp_char;
325  size_t i = 0, match = 0;
326  for (char c = cur_char(); i < len; ++i, c = next_and_char())
327  {
328  if (c == ']')
329  {
330  // Be aware that we may encounter a series of more than two ']'
331  // characters, in which case we'll only count the last two.
332 
333  if (match == 0)
334  // First ']'
335  ++match;
336  else if (match == 1)
337  // Second ']'
338  ++match;
339  }
340  else if (c == '>' && match == 2)
341  {
342  // Found ']]>'.
343  size_t cdata_len = i - 2;
344  m_handler.characters(pstring(p0, cdata_len), transient_stream());
345  next();
346  return;
347  }
348  else
349  match = 0;
350  }
351  throw sax::malformed_xml_error("malformed CDATA section.", offset());
352 }
353 
354 template<typename _Handler, typename _Config>
355 void sax_parser<_Handler,_Config>::doctype()
356 {
357  // Parse the root element first.
358  sax::doctype_declaration param;
359  name(param.root_element);
360  skip_space_and_control();
361 
362  // Either PUBLIC or SYSTEM.
363  size_t len = remains();
364  if (len < 6)
365  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
366 
367  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
368  char c = cur_char();
369  if (c == 'P')
370  {
371  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
372  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
373 
374  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
375  }
376  else if (c == 'S')
377  {
378  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
379  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
380  }
381 
382  next_check();
383  skip_space_and_control();
384  has_char_throw("DOCTYPE section too short.");
385 
386  // Parse FPI.
387  value(param.fpi, false);
388 
389  has_char_throw("DOCTYPE section too short.");
390  skip_space_and_control();
391  has_char_throw("DOCTYPE section too short.");
392 
393  if (cur_char() == '>')
394  {
395  // Optional URI not given. Exit.
396 #if ORCUS_DEBUG_SAX_PARSER
397  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
398 #endif
399  m_handler.doctype(param);
400  next();
401  return;
402  }
403 
404  // Parse optional URI.
405  value(param.uri, false);
406 
407  has_char_throw("DOCTYPE section too short.");
408  skip_space_and_control();
409  has_char_throw("DOCTYPE section too short.");
410 
411  if (cur_char() != '>')
412  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
413 
414 #if ORCUS_DEBUG_SAX_PARSER
415  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
416 #endif
417  m_handler.doctype(param);
418  next();
419 }
420 
421 template<typename _Handler, typename _Config>
422 void sax_parser<_Handler,_Config>::characters()
423 {
424  const char* p0 = mp_char;
425  for (; has_char(); next())
426  {
427  if (cur_char() == '<')
428  break;
429 
430  if (cur_char() == '&')
431  {
432  // Text span with one or more encoded characters. Parse using cell buffer.
433  cell_buffer& buf = get_cell_buffer();
434  buf.reset();
435  buf.append(p0, mp_char-p0);
436  characters_with_encoded_char(buf);
437  if (buf.empty())
438  m_handler.characters(pstring(), transient_stream());
439  else
440  m_handler.characters(pstring(buf.get(), buf.size()), true);
441  return;
442  }
443  }
444 
445  if (mp_char > p0)
446  {
447  pstring val(p0, mp_char-p0);
448  m_handler.characters(val, transient_stream());
449  }
450 }
451 
452 template<typename _Handler, typename _Config>
453 void sax_parser<_Handler,_Config>::attribute()
454 {
455  sax::parser_attribute attr;
456  pstring attr_ns_name, attr_name, attr_value;
457  attribute_name(attr.ns, attr.name);
458 
459 #if ORCUS_DEBUG_SAX_PARSER
460  cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
461 #endif
462 
463  skip_space_and_control();
464 
465  char c = cur_char();
466  if (c != '=')
467  {
468  std::ostringstream os;
469  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
470  throw sax::malformed_xml_error(os.str(), offset());
471  }
472 
473  next_check(); // skip the '='.
474  skip_space_and_control();
475 
476  attr.transient = value(attr.value, true);
477  if (attr.transient)
478  // Value is stored in a temporary buffer. Push a new buffer.
479  inc_buffer_pos();
480 
481 #if ORCUS_DEBUG_SAX_PARSER
482  cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
483 #endif
484 
485  m_handler.attribute(attr);
486 }
487 
488 }
489 
490 #endif
491 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
orcus::sax::parser_base
Definition: sax_parser_base.hpp:108
orcus::sax_parser
Definition: sax_parser.hpp:30
orcus::sax_parser_default_config::baseline_version
static const uint8_t baseline_version
Definition: sax_parser.hpp:22
orcus::sax_parser_default_config
Definition: sax_parser.hpp:15
orcus::parser_base
Definition: parser_base.hpp:39