Parser.js 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. var Tokenizer = require("./Tokenizer.js");
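  /*
    Options:

    xmlMode: Disables the special behavior for script/style tags (false by default)
    lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`)
    lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`)
    recognizeSelfClosing: treat a self-closing tag as closing the element even when xmlMode is `false`
    recognizeCDATA: report CDATA sections through oncdatastart/ontext/oncdataend even when xmlMode is `false` (otherwise they are reported as comments)
    Tokenizer: constructor to use instead of the default ./Tokenizer.js
  */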
  /*
    Callbacks (all optional):

    onattribute,
    oncdataend,
    oncdatastart,
    onclosetag,
    oncomment,
    oncommentend,
    onend,
    onerror,
    onopentag,
    onopentagname,
    onparserinit,
    onprocessinginstruction,
    onreset,
    ontext
  */
  //Set of form-related tags; used below as the "implicitly closed" set for
  //several form controls.
  //All tables here use `__proto__: null` so that `in` lookups only ever match
  //the listed tag names and never inherited Object.prototype keys such as
  //"constructor" or "toString".
  var formTags = {
    __proto__: null,
    input: true,
    option: true,
    optgroup: true,
    select: true,
    button: true,
    datalist: true,
    textarea: true
  };

  //Maps the name of a tag being opened to the set of tag names that are
  //implicitly closed when they are on top of the stack (see onopentagname).
  var openImpliesClose = {
    __proto__: null,
    tr: { __proto__: null, tr: true, th: true, td: true },
    th: { __proto__: null, th: true },
    td: { __proto__: null, thead: true, th: true, td: true },
    body: { __proto__: null, head: true, link: true, script: true },
    li: { __proto__: null, li: true },
    p: { __proto__: null, p: true },
    h1: { __proto__: null, p: true },
    h2: { __proto__: null, p: true },
    h3: { __proto__: null, p: true },
    h4: { __proto__: null, p: true },
    h5: { __proto__: null, p: true },
    h6: { __proto__: null, p: true },
    select: formTags,
    input: formTags,
    output: formTags,
    button: formTags,
    datalist: formTags,
    textarea: formTags,
    option: { __proto__: null, option: true },
    optgroup: { __proto__: null, optgroup: true }
  };
  //Tag names that, outside xmlMode, are never pushed onto the open-element
  //stack and get their onclosetag callback right after the open tag ends.
  //Built on a prototype-less object so `in` only matches the names below.
  var voidElements = Object.create(null);
  [
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "command",
    "embed",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr"
  ].forEach(function(tagName) {
    voidElements[tagName] = true;
  });
  //Tag names whose opening pushes `true` onto the parser's _foreignContext
  //stack; while `true` is on top, self-closing tags are honoured.
  //Prototype-less so `in` only matches the names assigned here.
  var foreignContextElements = Object.create(null);
  foreignContextElements.math = true;
  foreignContextElements.svg = true;
  //Tag names whose opening pushes `false` onto the parser's _foreignContext
  //stack, which switches self-closing handling back off inside foreign
  //content.
  //Tag names are lower-cased before the lookup whenever lowerCaseTags is in
  //effect, so the camelCase "foreignObject" key alone would never match in
  //that mode; the lower-case spelling is listed as well. The camelCase key is
  //kept for parsers that run with lowerCaseTags disabled.
  var htmlIntegrationElements = {
    __proto__: null,
    mi: true,
    mo: true,
    mn: true,
    ms: true,
    mtext: true,
    "annotation-xml": true,
    foreignObject: true,
    foreignobject: true,
    desc: true,
    title: true
  };
  //Matches the first whitespace character or "/"; _getInstructionName() uses
  //it to find where the leading name of a declaration or processing
  //instruction ends.
  var re_nameEnd = /\s|\//;
  //Creates a parser that receives tokenizer events and forwards them to the
  //given callbacks.
  //  cbs:     object holding the optional handler callbacks (defaults to {})
  //  options: parser options (defaults to {}); the same object is handed to
  //           the tokenizer
  //Calls cbs.onparserinit(parser), when present, once construction is done.
  function Parser(cbs, options) {
    this._options = options || {};
    this._cbs = cbs || {};

    this._tagname = "";
    this._attribname = "";
    this._attribvalue = "";
    this._attribs = null;
    this._stack = [];
    this._foreignContext = [];

    this.startIndex = 0;
    this.endIndex = null;

    //Both lower-casing flags default to "on unless xmlMode", but an explicit
    //option (even a falsy one) wins.
    this._lowerCaseTagNames =
      "lowerCaseTags" in this._options
        ? !!this._options.lowerCaseTags
        : !this._options.xmlMode;
    this._lowerCaseAttributeNames =
      "lowerCaseAttributeNames" in this._options
        ? !!this._options.lowerCaseAttributeNames
        : !this._options.xmlMode;

    //Choose the tokenizer per instance. Assigning the option to the shared
    //module-level `Tokenizer` binding would make one parser's custom
    //tokenizer the default for every parser created afterwards.
    var TokenizerImpl = this._options.Tokenizer || Tokenizer;
    this._tokenizer = new TokenizerImpl(this._options, this);

    if (this._cbs.onparserinit) this._cbs.onparserinit(this);
  }
  //Set up Parser to inherit from Node's EventEmitter using the `inherits`
  //package.
  require("inherits")(Parser, require("events").EventEmitter);
  //Moves startIndex/endIndex forward to cover the token just reported.
  //  initialOffset: how far before the tokenizer's current section start the
  //                 token begins; only used for the first token, i.e. while
  //                 endIndex is still null.
  Parser.prototype._updatePosition = function(initialOffset) {
    if (this.endIndex !== null) {
      //Later tokens start right after the previous one.
      this.startIndex = this.endIndex + 1;
    } else {
      //First token: back up from the section start, clamped at 0.
      this.startIndex = Math.max(
        0,
        this._tokenizer._sectionStart - initialOffset
      );
    }
    this.endIndex = this._tokenizer.getAbsoluteIndex();
  };
  128. //Tokenizer event handlers
  //Tokenizer event: a run of text. Updates the position (pulling endIndex
  //back by one) and forwards the text to the ontext callback, if any.
  Parser.prototype.ontext = function(data) {
    this._updatePosition(1);
    this.endIndex -= 1;

    if (!this._cbs.ontext) return;
    this._cbs.ontext(data);
  };
  //Tokenizer event: the name of an opening tag has been read.
  //Records the (optionally lower-cased) name, implicitly closes open tags
  //that the new tag terminates, pushes the tag onto the stack unless it is a
  //void element in HTML mode, and notifies the onopentagname callback.
  Parser.prototype.onopentagname = function(name) {
    if (this._lowerCaseTagNames) {
      name = name.toLowerCase();
    }
    this._tagname = name;

    var htmlMode = !this._options.xmlMode;

    if (htmlMode && name in openImpliesClose) {
      //Keep closing whatever is on top of the stack for as long as it is one
      //of the tags this opening tag implicitly closes.
      var implied = openImpliesClose[name];
      var top = this._stack[this._stack.length - 1];
      while (top in implied) {
        this.onclosetag(top);
        top = this._stack[this._stack.length - 1];
      }
    }

    var isVoid = htmlMode && name in voidElements;
    if (!isVoid) {
      this._stack.push(name);
      if (name in foreignContextElements) {
        this._foreignContext.push(true);
      } else if (name in htmlIntegrationElements) {
        this._foreignContext.push(false);
      }
    }

    if (this._cbs.onopentagname) this._cbs.onopentagname(name);
    //Attributes are only collected when somebody listens for onopentag.
    if (this._cbs.onopentag) this._attribs = {};
  };
  //Tokenizer event: an opening tag is complete.
  //Reports the tag with its collected attributes through onopentag and, in
  //HTML mode, immediately reports onclosetag for void elements. Clears the
  //current tag name afterwards.
  Parser.prototype.onopentagend = function() {
    this._updatePosition(1);

    if (this._attribs) {
      if (this._cbs.onopentag) {
        this._cbs.onopentag(this._tagname, this._attribs);
      }
      this._attribs = null;
    }

    var isHtmlVoid =
      !this._options.xmlMode && this._tagname in voidElements;
    if (isHtmlVoid && this._cbs.onclosetag) {
      this._cbs.onclosetag(this._tagname);
    }

    this._tagname = "";
  };
  //Tokenizer event: a closing tag was read.
  //Closes the named element together with everything opened after it. In HTML
  //mode a stray </p> (or </br> when not handled via the stack) is turned into
  //an implicit open+close pair instead of being dropped.
  Parser.prototype.onclosetag = function(name) {
    this._updatePosition(1);

    if (this._lowerCaseTagNames) {
      name = name.toLowerCase();
    }

    if (name in foreignContextElements || name in htmlIntegrationElements) {
      this._foreignContext.pop();
    }

    var xmlMode = this._options.xmlMode;
    var emitImplicitTag = false;

    if (this._stack.length && (xmlMode || !(name in voidElements))) {
      var openAt = this._stack.lastIndexOf(name);
      if (openAt === -1) {
        //No matching open tag: only </p> in HTML mode produces an element.
        emitImplicitTag = name === "p" && !xmlMode;
      } else if (this._cbs.onclosetag) {
        //Report every element from the top of the stack down to the match.
        for (
          var remaining = this._stack.length - openAt;
          remaining > 0;
          remaining--
        ) {
          this._cbs.onclosetag(this._stack.pop());
        }
      } else {
        //Nobody is listening, so simply drop the closed elements.
        this._stack.length = openAt;
      }
    } else {
      emitImplicitTag = !xmlMode && (name === "br" || name === "p");
    }

    if (emitImplicitTag) {
      this.onopentagname(name);
      this._closeCurrentTag();
    }
  };
  //Tokenizer event: a tag ended with "/>".
  //The tag is closed right away in xmlMode, with recognizeSelfClosing, or
  //while the innermost foreign-context entry is truthy; otherwise the slash
  //is ignored and the tag is handled like a normal opening tag.
  Parser.prototype.onselfclosingtag = function() {
    var honourSelfClosing =
      this._options.xmlMode ||
      this._options.recognizeSelfClosing ||
      this._foreignContext[this._foreignContext.length - 1];

    if (!honourSelfClosing) {
      this.onopentagend();
      return;
    }
    this._closeCurrentTag();
  };
  //Finishes the tag currently being opened and closes it again at once.
  Parser.prototype._closeCurrentTag = function() {
    //Remember the name first: onopentagend() clears this._tagname.
    var closingName = this._tagname;

    this.onopentagend();

    //A self-closing tag sits on top of the stack, so comparing the top entry
    //is enough (and cheaper than the search done in onclosetag).
    if (this._stack[this._stack.length - 1] !== closingName) return;

    if (this._cbs.onclosetag) {
      this._cbs.onclosetag(closingName);
    }
    this._stack.pop();
  };
  //Tokenizer event: an attribute name was read. Stores it, lower-cased when
  //attribute-name lower-casing is enabled.
  Parser.prototype.onattribname = function(name) {
    this._attribname = this._lowerCaseAttributeNames
      ? name.toLowerCase()
      : name;
  };
  //Tokenizer event: a piece of the current attribute's value. Pieces are
  //appended until onattribend is called.
  Parser.prototype.onattribdata = function(value) {
    this._attribvalue = this._attribvalue + value;
  };
  //Tokenizer event: the current attribute is complete.
  //Reports it through onattribute and stores it on the pending attribute map;
  //when a name occurs more than once on a tag, the first value is kept.
  Parser.prototype.onattribend = function() {
    if (this._cbs.onattribute) {
      this._cbs.onattribute(this._attribname, this._attribvalue);
    }

    var pending = this._attribs;
    if (pending) {
      var alreadySet = Object.prototype.hasOwnProperty.call(
        pending,
        this._attribname
      );
      if (!alreadySet) {
        pending[this._attribname] = this._attribvalue;
      }
    }

    this._attribname = "";
    this._attribvalue = "";
  };
  //Returns the leading name of a declaration / processing instruction body:
  //everything before the first whitespace or "/" (the whole value if there is
  //none), lower-cased when tag-name lower-casing is enabled.
  Parser.prototype._getInstructionName = function(value) {
    var nameEnd = value.search(re_nameEnd);
    var name = value;
    if (nameEnd >= 0) {
      name = value.slice(0, nameEnd);
    }
    return this._lowerCaseTagNames ? name.toLowerCase() : name;
  };
  //Tokenizer event: a declaration was read. Reported through the
  //onprocessinginstruction callback with "!" prefixed to both name and value.
  Parser.prototype.ondeclaration = function(value) {
    if (!this._cbs.onprocessinginstruction) return;

    var instructionName = this._getInstructionName(value);
    this._cbs.onprocessinginstruction("!" + instructionName, "!" + value);
  };
  //Tokenizer event: a processing instruction was read. Reported through the
  //onprocessinginstruction callback with "?" prefixed to both name and value.
  Parser.prototype.onprocessinginstruction = function(value) {
    if (!this._cbs.onprocessinginstruction) return;

    var instructionName = this._getInstructionName(value);
    this._cbs.onprocessinginstruction("?" + instructionName, "?" + value);
  };
  //Tokenizer event: a comment was read. Updates the position, then fires
  //oncomment with the comment text followed by oncommentend.
  Parser.prototype.oncomment = function(value) {
    this._updatePosition(4);

    if (this._cbs.oncomment) {
      this._cbs.oncomment(value);
    }
    if (this._cbs.oncommentend) {
      this._cbs.oncommentend();
    }
  };
  //Tokenizer event: a CDATA section was read.
  //In xmlMode or with recognizeCDATA the content is reported as text wrapped
  //in oncdatastart/oncdataend; otherwise the section is reported as a comment
  //whose text is "[CDATA[" + value + "]]".
  Parser.prototype.oncdata = function(value) {
    this._updatePosition(1);

    if (this._options.xmlMode || this._options.recognizeCDATA) {
      if (this._cbs.oncdatastart) this._cbs.oncdatastart();
      if (this._cbs.ontext) this._cbs.ontext(value);
      if (this._cbs.oncdataend) this._cbs.oncdataend();
    } else {
      //Fire the comment callbacks directly. Going through this.oncomment()
      //would run _updatePosition() a second time for this one token, which
      //moves startIndex to endIndex + 1, i.e. past the token being reported.
      if (this._cbs.oncomment) {
        this._cbs.oncomment("[CDATA[" + value + "]]");
      }
      if (this._cbs.oncommentend) this._cbs.oncommentend();
    }
  };
  //Tokenizer event: an error occurred. Passed on to the onerror callback when
  //one is registered; otherwise ignored.
  Parser.prototype.onerror = function(err) {
    if (!this._cbs.onerror) return;
    this._cbs.onerror(err);
  };
  //Tokenizer event: end of input.
  //Reports onclosetag for every element still on the stack, innermost first
  //(the stack itself is left untouched), then fires onend.
  Parser.prototype.onend = function() {
    if (this._cbs.onclosetag) {
      var depth = this._stack.length;
      while (depth > 0) {
        depth -= 1;
        this._cbs.onclosetag(this._stack[depth]);
      }
    }

    if (this._cbs.onend) this._cbs.onend();
  };
  //Resets the parser to a blank state, ready to parse a new HTML document.
  //Fires onreset first, resets the tokenizer, restores every piece of
  //per-document state to the value the constructor gives it, and finally
  //fires onparserinit with the parser.
  Parser.prototype.reset = function() {
    if (this._cbs.onreset) this._cbs.onreset();
    this._tokenizer.reset();

    this._tagname = "";
    this._attribname = "";
    //A half-read attribute value would otherwise be prepended to the first
    //attribute of the next document.
    this._attribvalue = "";
    this._attribs = null;
    this._stack = [];
    //Stale entries would keep self-closing handling switched on or off.
    this._foreignContext = [];

    //Positions restart with the new document; endIndex === null makes
    //_updatePosition treat the next token as the first one again.
    this.startIndex = 0;
    this.endIndex = null;

    if (this._cbs.onparserinit) this._cbs.onparserinit(this);
  };
  //Parses one complete document in a single call: the parser is reset first,
  //then the data is passed to end() as the final chunk.
  Parser.prototype.parseComplete = function(data) {
    var parser = this;
    parser.reset();
    parser.end(data);
  };
  //Passes a chunk of input on to the tokenizer.
  Parser.prototype.write = function(chunk) {
    var tokenizer = this._tokenizer;
    tokenizer.write(chunk);
  };
  //Signals the end of input to the tokenizer, handing over an optional last
  //chunk.
  Parser.prototype.end = function(chunk) {
    var tokenizer = this._tokenizer;
    tokenizer.end(chunk);
  };
  //Asks the tokenizer to pause.
  Parser.prototype.pause = function() {
    var tokenizer = this._tokenizer;
    tokenizer.pause();
  };
  //Asks the tokenizer to resume after a pause.
  Parser.prototype.resume = function() {
    var tokenizer = this._tokenizer;
    tokenizer.resume();
  };
  //Aliases kept for backwards compatibility: parseChunk is the same function
  //as write, and done is the same function as end.
  Parser.prototype.parseChunk = Parser.prototype.write;
  Parser.prototype.done = Parser.prototype.end;
  321. module.exports = Parser;