index.js 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. 'use strict';
var TransformStream = require('stream').Transform,
    StringDecoder = require('string_decoder').StringDecoder,
    DevNullStream = require('./dev_null_stream'),
    inherits = require('util').inherits,
    Tokenizer = require('../tokenizer'),
    LocationInfoTokenizerMixin = require('../extensions/location_info/tokenizer_mixin'),
    ParserFeedbackSimulator = require('./parser_feedback_simulator'),
    mergeOptions = require('../utils/merge_options');
  9. var DEFAULT_OPTIONS = {
  10. locationInfo: false
  11. };
  12. var SAXParser = module.exports = function (options) {
  13. TransformStream.call(this);
  14. this.options = mergeOptions(DEFAULT_OPTIONS, options);
  15. this.tokenizer = new Tokenizer(options);
  16. if (this.options.locationInfo)
  17. new LocationInfoTokenizerMixin(this.tokenizer);
  18. this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);
  19. this.pendingText = null;
  20. this.currentTokenLocation = void 0;
  21. this.lastChunkWritten = false;
  22. this.stopped = false;
  23. // NOTE: always pipe stream to the /dev/null stream to avoid
  24. // `highWaterMark` hit even if we don't have consumers.
  25. // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
  26. this.pipe(new DevNullStream());
  27. };
  28. inherits(SAXParser, TransformStream);
  29. //TransformStream implementation
  30. SAXParser.prototype._transform = function (chunk, encoding, callback) {
  31. if (!this.stopped) {
  32. this.tokenizer.write(chunk.toString('utf8'), this.lastChunkWritten);
  33. this._runParsingLoop();
  34. }
  35. this.push(chunk);
  36. callback();
  37. };
  38. SAXParser.prototype._flush = function (callback) {
  39. callback();
  40. };
  41. SAXParser.prototype.end = function (chunk, encoding, callback) {
  42. this.lastChunkWritten = true;
  43. TransformStream.prototype.end.call(this, chunk, encoding, callback);
  44. };
  45. SAXParser.prototype.stop = function () {
  46. this.stopped = true;
  47. };
  48. //Internals
  49. SAXParser.prototype._runParsingLoop = function () {
  50. do {
  51. var token = this.parserFeedbackSimulator.getNextToken();
  52. if (token.type === Tokenizer.HIBERNATION_TOKEN)
  53. break;
  54. if (token.type === Tokenizer.CHARACTER_TOKEN ||
  55. token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
  56. token.type === Tokenizer.NULL_CHARACTER_TOKEN) {
  57. if (this.options.locationInfo) {
  58. if (this.pendingText === null)
  59. this.currentTokenLocation = token.location;
  60. else
  61. this.currentTokenLocation.endOffset = token.location.endOffset;
  62. }
  63. this.pendingText = (this.pendingText || '') + token.chars;
  64. }
  65. else {
  66. this._emitPendingText();
  67. this._handleToken(token);
  68. }
  69. } while (!this.stopped && token.type !== Tokenizer.EOF_TOKEN);
  70. };
  71. SAXParser.prototype._handleToken = function (token) {
  72. if (this.options.locationInfo)
  73. this.currentTokenLocation = token.location;
  74. if (token.type === Tokenizer.START_TAG_TOKEN)
  75. this.emit('startTag', token.tagName, token.attrs, token.selfClosing, this.currentTokenLocation);
  76. else if (token.type === Tokenizer.END_TAG_TOKEN)
  77. this.emit('endTag', token.tagName, this.currentTokenLocation);
  78. else if (token.type === Tokenizer.COMMENT_TOKEN)
  79. this.emit('comment', token.data, this.currentTokenLocation);
  80. else if (token.type === Tokenizer.DOCTYPE_TOKEN)
  81. this.emit('doctype', token.name, token.publicId, token.systemId, this.currentTokenLocation);
  82. };
  83. SAXParser.prototype._emitPendingText = function () {
  84. if (this.pendingText !== null) {
  85. this.emit('text', this.pendingText, this.currentTokenLocation);
  86. this.pendingText = null;
  87. }
  88. };