saxes.js 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065
  1. "use strict";
  2. const { isS, isChar, isNameStartChar, isNameChar, S_LIST, NAME_RE } =
  3. require("xmlchars/xml/1.0/ed5");
  4. const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } = require("xmlchars/xmlns/1.0/ed3");
  // Namespace URIs that the "xml" and "xmlns" prefixes are checked against in
  // nsPairCheck.
  const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
  const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";

  // Namespace bindings copied into every namespace-aware parser. The object has
  // no prototype, so only the prefixes listed here can ever be found on it.
  const rootNS = Object.assign(Object.create(null), {
    xml: XML_NAMESPACE,
    xmlns: XMLNS_NAMESPACE,
  });

  // The predefined entities. Also prototype-less; each parser derives its own
  // ENTITIES map from this object with Object.create.
  const XML_ENTITIES = Object.assign(Object.create(null), {
    amp: "&",
    gt: ">",
    lt: "<",
    quot: "\"",
    apos: "'",
  });
  // Parser states. SaxesParser#write dispatches with ``this[this.state]()``, so
  // every string below has to be the name of a handler method on the parser.

  // Document start.
  const S_INITIAL = "sInitial";
  const S_BEGIN_WHITESPACE = "sBeginWhitespace";

  // Doctype and internal DTD subset.
  // <!DOCTYPE
  const S_DOCTYPE = "sDoctype";
  // <!DOCTYPE "//blah
  const S_DOCTYPE_QUOTE = "sDoctypeQuote";
  // <!DOCTYPE "//blah" [ ...
  const S_DTD = "sDTD";
  // <!DOCTYPE "//blah" [ "foo
  const S_DTD_QUOTED = "sDTDQuoted";
  const S_DTD_OPEN_WAKA = "sDTDOpenWaka";
  const S_DTD_OPEN_WAKA_BANG = "sDTDOpenWakaBang";
  // <!--
  const S_DTD_COMMENT = "sDTDComment";
  // <!-- blah -
  const S_DTD_COMMENT_ENDING = "sDTDCommentEnding";
  // <!-- blah --
  const S_DTD_COMMENT_ENDED = "sDTDCommentEnded";
  // <?
  const S_DTD_PI = "sDTDPI";
  // <?hi "there" ?
  const S_DTD_PI_ENDING = "sDTDPIEnding";

  // Character data and entity references.
  const S_TEXT = "sText";
  // &amp and such
  const S_ENTITY = "sEntity";

  // Markup openers, comments and CDATA sections.
  // <
  const S_OPEN_WAKA = "sOpenWaka";
  // <!...
  const S_OPEN_WAKA_BANG = "sOpenWakaBang";
  // <!--
  const S_COMMENT = "sComment";
  // <!-- blah -
  const S_COMMENT_ENDING = "sCommentEnding";
  // <!-- blah --
  const S_COMMENT_ENDED = "sCommentEnded";
  // <![CDATA[ something
  const S_CDATA = "sCData";
  // ]
  const S_CDATA_ENDING = "sCDataEnding";
  // ]]
  const S_CDATA_ENDING_2 = "sCDataEnding2";

  // Processing instructions.
  // <?hi, first char
  const S_PI_FIRST_CHAR = "sPIFirstChar";
  // <?hi, rest of the name
  const S_PI_REST = "sPIRest";
  // <?hi there
  const S_PI_BODY = "sPIBody";
  // <?hi "there" ?
  const S_PI_ENDING = "sPIEnding";

  // Tags and attributes.
  // <strong
  const S_OPEN_TAG = "sOpenTag";
  // <strong /
  const S_OPEN_TAG_SLASH = "sOpenTagSlash";
  // <a
  const S_ATTRIB = "sAttrib";
  // <a foo
  const S_ATTRIB_NAME = "sAttribName";
  // <a foo _
  const S_ATTRIB_NAME_SAW_WHITE = "sAttribNameSawWhite";
  // <a foo=
  const S_ATTRIB_VALUE = "sAttribValue";
  // <a foo="bar
  const S_ATTRIB_VALUE_QUOTED = "sAttribValueQuoted";
  // <a foo="bar"
  const S_ATTRIB_VALUE_CLOSED = "sAttribValueClosed";
  // <a foo=bar
  const S_ATTRIB_VALUE_UNQUOTED = "sAttribValueUnquoted";
  // </a
  const S_CLOSE_TAG = "sCloseTag";
  // </a >
  const S_CLOSE_TAG_SAW_WHITE = "sCloseTagSawWhite";

  // Sub-states used only while sPIBody reads an XML declaration. They are
  // numbers rather than handler names.
  // <?xml
  const S_XML_DECL_NAME_START = 1;
  // <?xml foo
  const S_XML_DECL_NAME = 2;
  // <?xml foo=
  const S_XML_DECL_EQ = 3;
  // <?xml foo=
  const S_XML_DECL_VALUE_START = 4;
  // <?xml foo="bar"
  const S_XML_DECL_VALUE = 5;
  64. /**
  65. * The list of supported events.
  66. */
  67. exports.EVENTS = [
  68. "text",
  69. "processinginstruction",
  70. "doctype",
  71. "comment",
  72. "opentagstart",
  73. "opentag",
  74. "closetag",
  75. "cdata",
  76. "error",
  77. "end",
  78. "ready",
  79. ];
  // Code points of the characters the parser compares against. getCode returns
  // numbers, so every comparison is done on these rather than on strings.
  const NL = 0xA; // line feed
  const CR = 0xD; // carriage return
  const SPACE = 0x20; // " "
  const BANG = 0x21; // !
  const DQUOTE = 0x22; // "
  const AMP = 0x26; // &
  const SQUOTE = 0x27; // '
  const MINUS = 0x2D; // -
  const FORWARD_SLASH = 0x2F; // /
  const SEMICOLON = 0x3B; // ;
  const LESS = 0x3C; // <
  const EQUAL = 0x3D; // =
  const GREATER = 0x3E; // >
  const QUESTION = 0x3F; // ?
  const OPEN_BRACKET = 0x5B; // [
  const CLOSE_BRACKET = 0x5D; // ]
  /**
   * Tell whether a code point is a single or a double quote.
   *
   * @param {number} c The code point to test.
   *
   * @returns {boolean} ``true`` if ``c`` is ``DQUOTE`` or ``SQUOTE``.
   */
  function isQuote(c) {
    switch (c) {
    case DQUOTE:
    case SQUOTE:
      return true;
    default:
      return false;
    }
  }
  // Lists of code points handed to captureTo: reading stops at the first
  // character that appears in the list.
  const QUOTES = [DQUOTE, SQUOTE];
  const DOCTYPE_TERMINATOR = QUOTES.concat([OPEN_BRACKET, GREATER]);
  const DTD_TERMINATOR = QUOTES.concat([LESS, CLOSE_BRACKET]);
  const XML_DECL_NAME_TERMINATOR = [EQUAL, QUESTION].concat(Array.from(S_LIST));
  const ATTRIB_VALUE_UNQUOTED_TERMINATOR =
    Array.from(S_LIST).concat([GREATER, AMP, LESS]);
  /**
   * Check one prefix-to-URI binding against the rules for the reserved ``xml``
   * and ``xmlns`` prefixes and namespaces. Violations are reported through
   * ``parser.fail``; nothing is returned.
   *
   * @param {object} parser The object whose ``fail`` method receives the error
   * messages.
   *
   * @param {string} prefix The prefix being bound. The empty string stands for
   * the default namespace.
   *
   * @param {string} uri The URI the prefix is bound to.
   */
  function nsPairCheck(parser, prefix, uri) {
    // First pass: a reserved prefix may only point at its own namespace.
    if (prefix === "xml") {
      if (uri !== XML_NAMESPACE) {
        parser.fail(`xml prefix must be bound to ${XML_NAMESPACE}.`);
      }
    }
    else if (prefix === "xmlns") {
      if (uri !== XMLNS_NAMESPACE) {
        parser.fail(`xmlns prefix must be bound to ${XMLNS_NAMESPACE}.`);
      }
    }

    // Second pass: a reserved namespace may only be reached through its own
    // prefix, and the xmlns namespace may not be bound at all.
    if (uri === XMLNS_NAMESPACE) {
      const message = prefix === "" ?
        `the default namespace may not be set to ${uri}.` :
        `may not assign a prefix (even "xmlns") to the URI ${XMLNS_NAMESPACE}.`;
      parser.fail(message);
    }
    else if (uri === XML_NAMESPACE) {
      if (prefix === "") {
        parser.fail(`the default namespace may not be set to ${uri}.`);
      }
      else if (prefix !== "xml") {
        // Only "xml" itself may be assigned the XML namespace.
        parser.fail("may not assign the xml namespace to another prefix.");
      }
    }
  }
  /**
   * Run ``nsPairCheck`` on every own enumerable binding of a prefix-to-URI
   * map.
   *
   * @param {object} parser The parser passed through to ``nsPairCheck``.
   *
   * @param {Object.<string, string>} mapping The bindings to check.
   */
  function nsMappingCheck(parser, mapping) {
    Object.keys(mapping).forEach((prefix) => {
      nsPairCheck(parser, prefix, mapping[prefix]);
    });
  }
  /**
   * Test a whole string against ``NC_NAME_RE``.
   *
   * @param {string} name The string to test.
   *
   * @returns {boolean} Whatever ``NC_NAME_RE.test`` reports for ``name``.
   */
  function isNCName(name) {
    const matches = NC_NAME_RE.test(name);
    return matches;
  }
  /**
   * Test a whole string against ``NAME_RE``.
   *
   * @param {string} name The string to test.
   *
   * @returns {boolean} Whatever ``NAME_RE.test`` reports for ``name``.
   */
  function isName(name) {
    const matches = NAME_RE.test(name);
    return matches;
  }
  // Values of SaxesParser#forbiddenState, which the text handler reads while it
  // watches for the forbidden sequence ``]]>`` in character data.
  // NOTE(review): judging by the names, the value records how many consecutive
  // "]" have just been seen -- confirm against handleTextInRoot.
  const FORBIDDEN_START = 0;
  const FORBIDDEN_BRACKET = 1;
  const FORBIDDEN_BRACKET_BRACKET = 2;
  154. /**
  155. * Data structure for an XML tag.
  156. *
  157. * @typedef {object} SaxesTag
  158. *
  159. * @property {string} name The tag's name. This is the combination of prefix and
  160. * local name. For instance ``<a:b>`` would have ``"a:b"`` for ``name``.
  161. *
  162. * @property {string} prefix The tag's prefix. For instance ``<a:b>`` would have
  163. * ``"a"`` for ``prefix``. Undefined if we do not track namespaces.
  164. *
  165. * @property {string} local The tag's local name. For instance ``<a:b>`` would
  166. * have ``"b"`` for ``local``. Undefined if we do not track namespaces.
  167. *
  168. * @property {string} uri The namespace URI of this tag. Undefined if we do not
  169. * track namespaces.
  170. *
  171. * @property {Object.<string, SaxesAttribute> | Object.<string, string>}
  172. * attributes A map of attribute name to attributes. If namespaces are tracked,
  173. * the values in the map are {@link SaxesAttribute SaxesAttribute}
  174. * objects. Otherwise, they are strings.
  175. *
  176. * @property {Object.<string, string>} ns The namespace bindings in effect.
  177. *
  178. * @property {boolean} isSelfClosing Whether the tag is
  179. * self-closing (e.g. ``<foo/>``).
  180. *
  181. */
  182. /**
  183. * Data structure for an XML attribute
  184. *
  185. * @typedef {object} SaxesAttribute
  186. *
  187. * @property {string} name The attribute's name. This is the combination of
  188. * prefix and local name. For instance ``a:b="c"`` would have ``a:b`` for name.
  189. *
  190. * @property {string} prefix The attribute's prefix. For instance ``a:b="c"``
  191. * would have ``"a"`` for ``prefix``.
  192. *
  193. * @property {string} local The attribute's local name. For instance ``a:b="c"``
  194. * would have ``"b"`` for ``local``.
  195. *
  196. * @property {string} uri The namespace URI of this attribute.
  197. *
  198. * @property {string} value The attribute's value.
  199. */
  200. /**
  201. * @typedef XMLDecl
  202. *
  203. * @property {string} [version] The version specified by the XML declaration.
  204. *
  205. * @property {string} [encoding] The encoding specified by the XML declaration.
  206. *
  207. * @property {string} [standalone] The value of the standalone parameter
  208. * specified by the XML declaration.
  209. */
  210. /**
  211. * @callback ResolvePrefix
  212. *
  213. * @param {string} prefix The prefix to check.
  214. *
  215. * @returns {string|undefined} The URI corresponding to the prefix, if any.
  216. */
  217. /**
  218. * @typedef SaxesOptions
  219. *
  220. * @property {boolean} [xmlns] Whether to track namespaces. Unset means
  221. * ``false``.
  222. *
  223. * @property {boolean} [fragment] Whether to accept XML fragments. Unset means
  224. * ``false``.
  225. *
  226. * @property {Object.<string, string>} [additionalNamespaces] A plain object whose key, value
  227. * pairs define namespaces known before parsing the XML file. It is not legal
  228. * to pass bindings for the namespaces ``"xml"`` or ``"xmlns"``.
  229. *
  230. * @property {ResolvePrefix} [resolvePrefix] A function that will be used if the
  231. * parser cannot resolve a namespace prefix on its own.
  232. *
  233. * @property {boolean} [position] Whether to track positions. Unset means
  234. * ``true``.
  235. *
  236. * @property {string} [fileName] A file name to use for error reporting. Leaving
  237. * this unset will report a file name of "undefined". "File name" is a loose
  238. * concept. You could use a URL to some resource, or any descriptive name you
  239. * like.
  240. */
  241. class SaxesParser {
  242. /**
  243. * @param {SaxesOptions} opt The parser options.
  244. */
  245. constructor(opt) {
  246. this._init(opt);
  247. }
  248. /**
  249. * Reset the parser state.
  250. *
  251. * @private
  252. */
  253. _init(opt) {
  254. this.comment = "";
  255. this.openWakaBang = "";
  256. this.text = "";
  257. this.name = "";
  258. this.doctype = "";
  259. this.piTarget = "";
  260. this.piBody = "";
  261. this.entity = "";
  262. this.cdata = "";
  263. this.xmlDeclName = "";
  264. this.xmlDeclValue = "";
  265. /**
  266. * The options passed to the constructor of this parser.
  267. *
  268. * @type {SaxesOptions}
  269. */
  270. this.opt = opt || {};
  271. /**
  272. * Indicates whether or not the parser is closed. If ``true``, wait for
  273. * the ``ready`` event to write again.
  274. *
  275. * @type {boolean}
  276. */
  277. this.closed = false;
  278. /**
  279. * The XML declaration for this document.
  280. *
  281. * @type {XMLDecl}
  282. */
  283. this.xmlDecl = {
  284. version: undefined,
  285. encoding: undefined,
  286. standalone: undefined,
  287. };
  288. this.q = null;
  289. this.tags = [];
  290. this.tag = null;
  291. this.chunk = "";
  292. this.chunkPosition = 0;
  293. this.i = 0;
  294. this.trailingCR = false;
  295. this.forbiddenState = FORBIDDEN_START;
  296. /**
  297. * A map of entity name to expansion.
  298. *
  299. * @type {Object.<string, string>}
  300. */
  301. this.ENTITIES = Object.create(XML_ENTITIES);
  302. this.attribList = [];
  303. // The logic is organized so as to minimize the need to check
  304. // this.opt.fragment while parsing.
  305. const fragmentOpt = this.fragmentOpt = !!this.opt.fragment;
  306. this.state = fragmentOpt ? S_TEXT : S_INITIAL;
  307. // We want these to be all true if we are dealing with a fragment.
  308. this.reportedTextBeforeRoot = this.reportedTextAfterRoot = this.closedRoot =
  309. this.sawRoot = fragmentOpt;
  310. // An XML declaration is intially possible only when parsing whole
  311. // documents.
  312. this.xmlDeclPossible = !fragmentOpt;
  313. this.piIsXMLDecl = false;
  314. this.xmlDeclState = S_XML_DECL_NAME_START;
  315. this.xmlDeclExpects = ["version"];
  316. this.requiredSeparator = false;
  317. this.entityReturnState = undefined;
  318. const xmlnsOpt = this.xmlnsOpt = !!this.opt.xmlns;
  319. if (xmlnsOpt) {
  320. // This is the function we use to perform name checks on PIs and entities.
  321. // When namespaces are used, colons are not allowed in PI target names or
  322. // entity names. So the check depends on whether namespaces are used. See:
  323. //
  324. // https://www.w3.org/XML/xml-names-19990114-errata.html
  325. // NE08
  326. //
  327. this.nameStartCheck = isNCNameStartChar;
  328. this.nameCheck = isNCNameChar;
  329. this.isName = isNCName;
  330. this.processAttribs = this.processAttribsNS;
  331. this.pushAttrib = this.pushAttribNS;
  332. this.ns = Object.assign({ __proto__: null }, rootNS);
  333. const additional = this.opt.additionalNamespaces;
  334. if (additional) {
  335. nsMappingCheck(this, additional);
  336. Object.assign(this.ns, additional);
  337. }
  338. }
  339. else {
  340. this.nameStartCheck = isNameStartChar;
  341. this.nameCheck = isNameChar;
  342. this.isName = isName;
  343. this.processAttribs = this.processAttribsPlain;
  344. this.pushAttrib = this.pushAttribPlain;
  345. }
  346. this.trackPosition = this.opt.position !== false;
  347. /** The line number the parser is currently looking at. */
  348. this.line = 1;
  349. /** The column the parser is currently looking at. */
  350. this.column = 0;
  351. this.fileName = this.opt.fileName;
  352. this.onready();
  353. }
  354. /** The stream position the parser is currently looking at. */
  355. get position() {
  356. return this.chunkPosition + this.i;
  357. }
  358. /* eslint-disable class-methods-use-this */
  359. /**
  360. * Event handler for text data. The default implementation is a no-op.
  361. *
  362. * @param {string} text The text data encountered by the parser.
  363. *
  364. */
  365. ontext() {}
  366. /**
  367. * Event handler for processing instructions. The default implementation is a
  368. * no-op.
  369. *
  370. * @param {{target: string, body: string}} data The target and body of
  371. * the processing instruction.
  372. */
  373. onprocessinginstruction() {}
  374. /**
  375. * Event handler for doctype. The default implementation is a no-op.
  376. *
  377. * @param {string} doctype The doctype contents.
  378. */
  379. ondoctype() {}
  380. /**
  381. * Event handler for comments. The default implementation is a no-op.
  382. *
  383. * @param {string} comment The comment contents.
  384. */
  385. oncomment() {}
  386. /**
  387. * Event handler for the start of an open tag. This is called as soon as we
  388. * have a tag name. The default implementation is a no-op.
  389. *
  390. * @param {SaxesTag} tag The tag.
  391. */
  392. onopentagstart() {}
  393. /**
  394. * Event handler for an open tag. This is called when the open tag is
  395. * complete. (We've encountered the ">" that ends the open tag.) The default
  396. * implementation is a no-op.
  397. *
  398. * @param {SaxesTag} tag The tag.
  399. */
  400. onopentag() {}
  401. /**
  402. * Event handler for a close tag. Note that for self-closing tags, this is
  403. * called right after ``onopentag``. The default implementation is a no-op.
  404. *
  405. * @param {SaxesTag} tag The tag.
  406. */
  407. onclosetag() {}
  408. /**
  409. * Event handler for a CDATA section. This is called when ending the
  410. * CDATA section. The default implementation is a no-op.
  411. *
  412. * @param {string} cdata The contents of the CDATA section.
  413. */
  414. oncdata() {}
  415. /**
  416. * Event handler for the stream end. This is called when the stream has been
  417. * closed with ``close`` or by passing ``null`` to ``write``. The default
  418. * implementation is a no-op.
  419. */
  420. onend() {}
  421. /**
  422. * Event handler indicating parser readiness. This is called when the parser
  423. * is ready to parse a new document. The default implementation is a no-op.
  424. */
  425. onready() {}
  426. /**
  427. * Event handler indicating an error. The default implementation throws the
  428. * error. Override with a no-op handler if you don't want this.
  429. *
  430. * @param {Error} err The error that occurred.
  431. */
  432. onerror(err) {
  433. throw new Error(err);
  434. }
  435. /* eslint-enable class-methods-use-this */
  436. /**
  437. * Report a parsing error. This method is made public so that client code may
  438. * check for issues that are outside the scope of this project and can report
  439. * errors.
  440. *
  441. * @param {Error|string} er The error, or error message, to report.
  442. *
  443. * @returns this
  444. */
  445. fail(er) {
  446. const message = (this.trackPosition) ?
  447. `${this.fileName}:${this.line}:${this.column}: ${er}` : er;
  448. this.onerror(new Error(message));
  449. return this;
  450. }
  451. /**
  452. * Write XML data to the parser.
  453. *
  454. * @param {string|object|null} chunk The XML data to write. Objects are converted with ``toString()``. ``null`` ends the stream.
  455. *
  456. * @returns this
  457. */
  458. write(chunk) {
  459. if (this.closed) {
  460. return this.fail("cannot write after close; assign an onready handler.");
  461. }
  462. let end = false;
  463. if (chunk === null) {
  464. end = true;
  465. chunk = "";
  466. }
  467. if (typeof chunk === "object") {
  468. chunk = chunk.toString();
  469. }
  470. // We checked if performing a pre-decomposition of the string into an array
  471. // of single complete characters (``Array.from(chunk)``) would be faster
  472. // than the current repeated calls to ``codePointAt``. As of August 2018, it
  473. // isn't. (There may be Node-specific code that would perform faster than
  474. // ``Array.from`` but don't want to be dependent on Node.)
  475. let limit = chunk.length;
  476. if (this.trailingCR) {
  477. // The previous chunk had a trailing cr. We need to handle it now.
  478. chunk = `\r${chunk}`;
  479. }
  480. if (!end && chunk[limit - 1] === CR) {
  481. // The chunk ends with a trailing CR. We cannot know how to handle it
  482. // until we get the next chunk or the end of the stream. So save it for
  483. // later.
  484. limit--;
  485. this.trailingCR = true;
  486. }
  487. this.limit = limit;
  488. this.chunk = chunk;
  489. this.i = 0;
  490. while (this.i < limit) {
  491. this[this.state]();
  492. }
  493. this.chunkPosition += limit;
  494. return end ? this.end() : this;
  495. }
  496. /**
  497. * Close the current stream. Perform final well-formedness checks and reset
  498. * the parser state.
  499. *
  500. * @returns this
  501. */
  502. close() {
  503. return this.write(null);
  504. }
  505. /**
  506. * Get a single code point out of the current chunk. This updates the current
  507. * position if we do position tracking.
  508. *
  509. * @private
  510. *
  511. * @returns {number} The character read.
  512. */
  513. getCode() {
  514. const { chunk, i } = this;
  515. // Using charCodeAt and handling the surrogates ourselves is faster
  516. // than using codePointAt.
  517. let code = chunk.charCodeAt(i);
  518. let skip = 1;
  519. switch (code) {
  520. case CR:
  521. // We may get NaN if we read past the end of the chunk, which is
  522. // fine.
  523. if (chunk.charCodeAt(i + 1) === NL) {
  524. // A \r\n sequence is converted to \n so we have to skip over the next
  525. // character. We already know it has a size of 1 so ++ is fine here.
  526. skip++;
  527. }
  528. // Otherwise, a \r is just converted to \n, so we don't have to skip
  529. // ahead.
  530. // In either case, \r becomes \n.
  531. code = NL;
  532. /* yes, fall through */
  533. case NL:
  534. this.line++;
  535. this.column = 0;
  536. break;
  537. default:
  538. this.column++;
  539. if (code >= 0xD800 && code <= 0xDBFF) {
  540. code = 0x10000 + ((code - 0xD800) * 0x400) +
  541. (chunk.charCodeAt(i + 1) - 0xDC00);
  542. this.column++;
  543. skip++;
  544. }
  545. if (!isChar(code)) {
  546. this.fail("disallowed character.");
  547. }
  548. }
  549. this.i += skip;
  550. return code;
  551. }
  552. /**
  553. * @callback CharacterTest
  554. *
  555. * @private
  556. *
  557. * @param {string} c The character to test.
  558. *
  559. * @returns {boolean} ``true`` if the method should continue capturing text,
  560. * ``false`` otherwise.
  561. */
  562. /**
  563. * Capture characters into a buffer until encountering one of a set of
  564. * characters.
  565. *
  566. * @private
  567. *
  568. * @param {number[]} chars An array of codepoints. Encountering a character in
  569. * the array ends the capture.
  570. *
  571. * @param {string} buffer The name of the buffer to save into.
  572. *
  573. * @return {number|undefined} The character code that made the capture end, or
  574. * ``undefined`` if we hit the end of the chunk.
  575. */
  576. captureTo(chars, buffer) {
  577. const { chunk, limit, i: start } = this;
  578. while (this.i < limit) {
  579. const c = this.getCode();
  580. if (chars.includes(c)) {
  581. // This is faster than adding codepoints one by one.
  582. this[buffer] += chunk.substring(start,
  583. this.i - (c <= 0xFFFF ? 1 : 2));
  584. return c;
  585. }
  586. }
  587. // This is faster than adding codepoints one by one.
  588. this[buffer] += chunk.substring(start);
  589. return undefined;
  590. }
  591. /**
  592. * Capture characters into a buffer until encountering a character.
  593. *
  594. * @private
  595. *
  596. * @param {number} char The codepoint that ends the capture.
  597. *
  598. * @param {string} buffer The name of the buffer to save into.
  599. *
  600. * @return {boolean} ``true`` if we ran into the character. Otherwise, we ran
  601. * into the end of the current chunk.
  602. */
  603. captureToChar(char, buffer) {
  604. const { chunk, limit, i: start } = this;
  605. while (this.i < limit) {
  606. const c = this.getCode();
  607. if (c === char) {
  608. // This is faster than adding codepoints one by one.
  609. this[buffer] += chunk.substring(start,
  610. this.i - (c <= 0xFFFF ? 1 : 2));
  611. return true;
  612. }
  613. }
  614. // This is faster than adding codepoints one by one.
  615. this[buffer] += chunk.substring(start);
  616. return false;
  617. }
  618. /**
  619. * Capture characters that satisfy ``isNameChar`` into the ``name`` field of
  620. * this parser.
  621. *
  622. * @private
  623. *
  624. * @return {number|undefined} The character code that made the test fail, or
  625. * ``undefined`` if we hit the end of the chunk.
  626. */
  627. captureNameChars() {
  628. const { chunk, limit, i: start } = this;
  629. while (this.i < limit) {
  630. const c = this.getCode();
  631. if (!isNameChar(c)) {
  632. // This is faster than adding codepoints one by one.
  633. this.name += chunk.substring(start,
  634. this.i - (c <= 0xFFFF ? 1 : 2));
  635. return c;
  636. }
  637. }
  638. // This is faster than adding codepoints one by one.
  639. this.name += chunk.substring(start);
  640. return undefined;
  641. }
  642. /**
  643. * Capture characters into a buffer while ``this.nameCheck`` run on the
  644. * character read returns true.
  645. *
  646. * @private
  647. *
  648. * @param {string} buffer The name of the buffer to save into.
  649. *
  650. * @return {number|undefined} The character code that made the test fail, or
  651. * ``undefined`` if we hit the end of the chunk.
  652. */
  653. captureWhileNameCheck(buffer) {
  654. const { chunk, limit, i: start } = this;
  655. while (this.i < limit) {
  656. const c = this.getCode();
  657. if (!this.nameCheck(c)) {
  658. // This is faster than adding codepoints one by one.
  659. this[buffer] += chunk.substring(start,
  660. this.i - (c <= 0xFFFF ? 1 : 2));
  661. return c;
  662. }
  663. }
  664. // This is faster than adding codepoints one by one.
  665. this[buffer] += chunk.substring(start);
  666. return undefined;
  667. }
  668. /**
  669. * Skip white spaces.
  670. *
  671. * @private
  672. *
  673. * @return {number|undefined} The character code that ended the skip, or
  674. * ``undefined`` if we hit the end of the chunk.
  675. */
  676. skipSpaces() {
  677. const { limit } = this;
  678. while (this.i < limit) {
  679. const c = this.getCode();
  680. if (!isS(c)) {
  681. return c;
  682. }
  683. }
  684. return undefined;
  685. }
  686. // STATE HANDLERS
  687. /** @private */
  688. sInitial() {
  689. // We are essentially peeking at the first character of the chunk. Since
  690. // S_INITIAL can be in effect only when we start working on the first chunk,
  691. // the index at which we must look is necessarily 0. Note also that the
  692. // following tests do not depend on decoding surrogates.
  693. const c = this.chunk.charCodeAt(0);
  694. // If the initial character is 0xFEFF, ignore it.
  695. if (c === 0xFEFF) {
  696. this.i++;
  697. this.column++;
  698. }
  699. else if (isS(c)) {
  700. this.i++;
  701. this.column++;
  702. // An XML declaration cannot appear after initial spaces.
  703. this.xmlDeclPossible = false;
  704. }
  705. this.state = S_BEGIN_WHITESPACE;
  706. }
  707. /** @private */
  708. sBeginWhitespace() {
  709. const c = this.skipSpaces();
  710. if (c === LESS) {
  711. this.state = S_OPEN_WAKA;
  712. }
  713. else if (c) {
  714. // have to process this as a text node.
  715. // weird, but happens.
  716. if (!this.reportedTextBeforeRoot) {
  717. this.fail("text data outside of root node.");
  718. this.reportedTextBeforeRoot = true;
  719. }
  720. this.text = String.fromCodePoint(c);
  721. this.state = S_TEXT;
  722. this.xmlDeclPossible = false;
  723. }
  724. }
  725. /** @private */
  726. sText() {
  727. //
  728. // We did try a version of saxes where the S_TEXT state was split in two
  729. // states: one for text inside the root element, and one for text
  730. // outside. This was avoiding having to test this.tags.length to decide what
  731. // implementation to actually use.
  732. //
  733. // Peformance testing on gigabyte-size files did not show any advantage to
  734. // using the two states solution instead of the current one. Conversely, it
  735. // made the code a bit more complicated elsewhere. For instance, a comment
  736. // can appear before the root element so when a comment ended it was
  737. // necessary to determine whether to return to the S_TEXT state or to the
  738. // new text-outside-root state.
  739. //
  740. if (this.tags.length !== 0) {
  741. this.handleTextInRoot();
  742. }
  743. else {
  744. this.handleTextOutsideRoot();
  745. }
  746. }
  747. /** @private */
  748. handleTextInRoot() {
  749. // This is essentially a specialized version of captureTo which is optimized
  750. // for performing the ]]> check. A previous version of this code, checked
  751. // ``this.text`` for the presence of ]]>. It simplified the code but was
  752. // very costly when character data contained a lot of entities to be parsed.
  753. //
  754. // Since we are using a specialized loop, we also keep track of the presence
  755. // of ]]> in text data. The sequence ]]> is forbidden to appear as-is.
  756. //
  757. const { chunk, limit, i: start } = this;
  758. let { forbiddenState } = this;
  759. let c;
  760. // eslint-disable-next-line no-labels, no-restricted-syntax
  761. scanLoop:
  762. while (this.i < limit) {
  763. const code = this.getCode();
  764. switch (code) {
  765. case LESS:
  766. this.state = S_OPEN_WAKA;
  767. c = code;
  768. forbiddenState = FORBIDDEN_START;
  769. // eslint-disable-next-line no-labels
  770. break scanLoop;
  771. case AMP:
  772. this.state = S_ENTITY;
  773. this.entityReturnState = S_TEXT;
  774. c = code;
  775. forbiddenState = FORBIDDEN_START;
  776. // eslint-disable-next-line no-labels
  777. break scanLoop;
  778. case CLOSE_BRACKET:
  779. switch (forbiddenState) {
  780. case FORBIDDEN_START:
  781. forbiddenState = FORBIDDEN_BRACKET;
  782. break;
  783. case FORBIDDEN_BRACKET:
  784. forbiddenState = FORBIDDEN_BRACKET_BRACKET;
  785. break;
  786. case FORBIDDEN_BRACKET_BRACKET:
  787. break;
  788. default:
  789. throw new Error("impossible state");
  790. }
  791. break;
  792. case GREATER:
  793. if (forbiddenState === FORBIDDEN_BRACKET_BRACKET) {
  794. this.fail("the string \"]]>\" is disallowed in char data.");
  795. }
  796. forbiddenState = FORBIDDEN_START;
  797. break;
  798. default:
  799. forbiddenState = FORBIDDEN_START;
  800. }
  801. }
  802. this.forbiddenState = forbiddenState;
  803. // This is faster than adding codepoints one by one.
  804. this.text += chunk.substring(start,
  805. c === undefined ? undefined :
  806. (this.i - (c <= 0xFFFF ? 1 : 2)));
  807. }
  808. /** @private */
  809. handleTextOutsideRoot() {
  810. // This is essentially a specialized version of captureTo which is optimized
  811. // for performing the ]]> check. A previous version of this code, checked
  812. // ``this.text`` for the presence of ]]>. It simplified the code but was
  813. // very costly when character data contained a lot of entities to be parsed.
  814. //
  815. // Since we are using a specialized loop, we also keep track of the presence
  816. // of non-space characters in the text since these are errors when appearing
  817. // outside the document root element.
  818. //
  819. const { chunk, limit, i: start } = this;
  820. let nonSpace = false;
  821. let c;
  822. // eslint-disable-next-line no-labels, no-restricted-syntax
  823. outRootLoop:
  824. while (this.i < limit) {
  825. const code = this.getCode();
  826. switch (code) {
  827. case LESS:
  828. this.state = S_OPEN_WAKA;
  829. c = code;
  830. // eslint-disable-next-line no-labels
  831. break outRootLoop;
  832. case AMP:
  833. this.state = S_ENTITY;
  834. this.entityReturnState = S_TEXT;
  835. c = code;
  836. nonSpace = true;
  837. // eslint-disable-next-line no-labels
  838. break outRootLoop;
  839. default:
  840. if (!isS(code)) {
  841. nonSpace = true;
  842. }
  843. }
  844. }
  845. // This is faster than adding codepoints one by one.
  846. this.text += chunk.substring(start,
  847. c === undefined ? undefined :
  848. (this.i - (c <= 0xFFFF ? 1 : 2)));
  849. if (!nonSpace) {
  850. return;
  851. }
  852. // We use the reportedTextBeforeRoot and reportedTextAfterRoot flags
  853. // to avoid reporting errors for every single character that is out of
  854. // place.
  855. if (!this.sawRoot && !this.reportedTextBeforeRoot) {
  856. this.fail("text data outside of root node.");
  857. this.reportedTextBeforeRoot = true;
  858. }
  859. if (this.closedRoot && !this.reportedTextAfterRoot) {
  860. this.fail("text data outside of root node.");
  861. this.reportedTextAfterRoot = true;
  862. }
  863. }
  864. /** @private */
  865. sOpenWaka() {
  866. const c = this.getCode();
  867. // either a /, ?, !, or text is coming next.
  868. if (isNameStartChar(c)) {
  869. this.state = S_OPEN_TAG;
  870. this.name = String.fromCodePoint(c);
  871. this.xmlDeclPossible = false;
  872. }
  873. else {
  874. switch (c) {
  875. case FORWARD_SLASH:
  876. this.state = S_CLOSE_TAG;
  877. this.xmlDeclPossible = false;
  878. break;
  879. case BANG:
  880. this.state = S_OPEN_WAKA_BANG;
  881. this.openWakaBang = "";
  882. this.xmlDeclPossible = false;
  883. break;
  884. case QUESTION:
  885. this.state = S_PI_FIRST_CHAR;
  886. break;
  887. default:
  888. this.fail("disallowed character in tag name.");
  889. this.state = S_TEXT;
  890. this.xmlDeclPossible = false;
  891. }
  892. }
  893. }
  894. /** @private */
  895. sOpenWakaBang() {
  896. this.openWakaBang += String.fromCodePoint(this.getCode());
  897. switch (this.openWakaBang) {
  898. case "[CDATA[":
  899. if (!this.sawRoot && !this.reportedTextBeforeRoot) {
  900. this.fail("text data outside of root node.");
  901. this.reportedTextBeforeRoot = true;
  902. }
  903. if (this.closedRoot && !this.reportedTextAfterRoot) {
  904. this.fail("text data outside of root node.");
  905. this.reportedTextAfterRoot = true;
  906. }
  907. this.state = S_CDATA;
  908. this.openWakaBang = "";
  909. break;
  910. case "--":
  911. this.state = S_COMMENT;
  912. this.openWakaBang = "";
  913. break;
  914. case "DOCTYPE":
  915. this.state = S_DOCTYPE;
  916. if (this.doctype || this.sawRoot) {
  917. this.fail("inappropriately located doctype declaration.");
  918. }
  919. this.openWakaBang = "";
  920. break;
  921. default:
  922. // 7 happens to be the maximum length of the string that can possibly
  923. // match one of the cases above.
  924. if (this.openWakaBang.length >= 7) {
  925. this.fail("incorrect syntax.");
  926. }
  927. }
  928. }
  929. /** @private */
  930. sDoctype() {
  931. const c = this.captureTo(DOCTYPE_TERMINATOR, "doctype");
  932. if (c === GREATER) {
  933. this.state = S_TEXT;
  934. if (this.text.length !== 0) {
  935. this.closeText();
  936. }
  937. this.ondoctype(this.doctype);
  938. this.doctype = true; // just remember that we saw it.
  939. }
  940. else if (c) {
  941. this.doctype += String.fromCodePoint(c);
  942. if (c === OPEN_BRACKET) {
  943. this.state = S_DTD;
  944. }
  945. else if (isQuote(c)) {
  946. this.state = S_DOCTYPE_QUOTE;
  947. this.q = c;
  948. }
  949. }
  950. }
  951. /** @private */
  952. sDoctypeQuote() {
  953. const { q } = this;
  954. if (this.captureToChar(q, "doctype")) {
  955. this.doctype += String.fromCodePoint(q);
  956. this.q = null;
  957. this.state = S_DOCTYPE;
  958. }
  959. }
  960. /** @private */
  961. sDTD() {
  962. const c = this.captureTo(DTD_TERMINATOR, "doctype");
  963. if (!c) {
  964. return;
  965. }
  966. this.doctype += String.fromCodePoint(c);
  967. if (c === CLOSE_BRACKET) {
  968. this.state = S_DOCTYPE;
  969. }
  970. else if (c === LESS) {
  971. this.state = S_DTD_OPEN_WAKA;
  972. }
  973. else if (isQuote(c)) {
  974. this.state = S_DTD_QUOTED;
  975. this.q = c;
  976. }
  977. }
  978. /** @private */
  979. sDTDQuoted() {
  980. const { q } = this;
  981. if (this.captureToChar(q, "doctype")) {
  982. this.doctype += String.fromCodePoint(q);
  983. this.state = S_DTD;
  984. this.q = null;
  985. }
  986. }
  987. /** @private */
  988. sDTDOpenWaka() {
  989. const c = this.getCode();
  990. this.doctype += String.fromCodePoint(c);
  991. switch (c) {
  992. case BANG:
  993. this.state = S_DTD_OPEN_WAKA_BANG;
  994. this.openWakaBang = "";
  995. break;
  996. case QUESTION:
  997. this.state = S_DTD_PI;
  998. break;
  999. default:
  1000. this.state = S_DTD;
  1001. }
  1002. }
  1003. /** @private */
  1004. sDTDOpenWakaBang() {
  1005. const char = String.fromCodePoint(this.getCode());
  1006. const owb = this.openWakaBang += char;
  1007. this.doctype += char;
  1008. if (owb !== "-") {
  1009. this.state = owb === "--" ? S_DTD_COMMENT : S_DTD;
  1010. this.openWakaBang = "";
  1011. }
  1012. }
  1013. /** @private */
  1014. sDTDComment() {
  1015. if (this.captureToChar(MINUS, "doctype")) {
  1016. this.doctype += "-";
  1017. this.state = S_DTD_COMMENT_ENDING;
  1018. }
  1019. }
  1020. /** @private */
  1021. sDTDCommentEnding() {
  1022. const c = this.getCode();
  1023. this.doctype += String.fromCodePoint(c);
  1024. this.state = c === MINUS ? S_DTD_COMMENT_ENDED : S_DTD_COMMENT;
  1025. }
  1026. /** @private */
  1027. sDTDCommentEnded() {
  1028. const c = this.getCode();
  1029. this.doctype += String.fromCodePoint(c);
  1030. if (c === GREATER) {
  1031. this.state = S_DTD;
  1032. }
  1033. else {
  1034. this.fail("malformed comment.");
  1035. // <!-- blah -- bloo --> will be recorded as
  1036. // a comment of " blah -- bloo "
  1037. this.state = S_DTD_COMMENT;
  1038. }
  1039. }
  1040. /** @private */
  1041. sDTDPI() {
  1042. if (this.captureToChar(QUESTION, "doctype")) {
  1043. this.doctype += "?";
  1044. this.state = S_DTD_PI_ENDING;
  1045. }
  1046. }
  1047. /** @private */
  1048. sDTDPIEnding() {
  1049. const c = this.getCode();
  1050. this.doctype += String.fromCodePoint(c);
  1051. if (c === GREATER) {
  1052. this.state = S_DTD;
  1053. }
  1054. }
  1055. /** @private */
  1056. sComment() {
  1057. if (this.captureToChar(MINUS, "comment")) {
  1058. this.state = S_COMMENT_ENDING;
  1059. }
  1060. }
  1061. /** @private */
  1062. sCommentEnding() {
  1063. const c = this.getCode();
  1064. if (c === MINUS) {
  1065. this.state = S_COMMENT_ENDED;
  1066. if (this.text.length !== 0) {
  1067. this.closeText();
  1068. }
  1069. this.oncomment(this.comment);
  1070. this.comment = "";
  1071. }
  1072. else {
  1073. this.comment += `-${String.fromCodePoint(c)}`;
  1074. this.state = S_COMMENT;
  1075. }
  1076. }
  1077. /** @private */
  1078. sCommentEnded() {
  1079. const c = this.getCode();
  1080. if (c !== GREATER) {
  1081. this.fail("malformed comment.");
  1082. // <!-- blah -- bloo --> will be recorded as
  1083. // a comment of " blah -- bloo "
  1084. this.comment += `--${String.fromCodePoint(c)}`;
  1085. this.state = S_COMMENT;
  1086. }
  1087. else {
  1088. this.state = S_TEXT;
  1089. }
  1090. }
  1091. /** @private */
  1092. sCData() {
  1093. if (this.captureToChar(CLOSE_BRACKET, "cdata")) {
  1094. this.state = S_CDATA_ENDING;
  1095. }
  1096. }
  1097. /** @private */
  1098. sCDataEnding() {
  1099. const c = this.getCode();
  1100. if (c === CLOSE_BRACKET) {
  1101. this.state = S_CDATA_ENDING_2;
  1102. }
  1103. else {
  1104. this.cdata += `]${String.fromCodePoint(c)}`;
  1105. this.state = S_CDATA;
  1106. }
  1107. }
  1108. /** @private */
  1109. sCDataEnding2() {
  1110. const c = this.getCode();
  1111. switch (c) {
  1112. case GREATER:
  1113. if (this.text.length !== 0) {
  1114. this.closeText();
  1115. }
  1116. this.oncdata(this.cdata);
  1117. this.cdata = "";
  1118. this.state = S_TEXT;
  1119. break;
  1120. case CLOSE_BRACKET:
  1121. this.cdata += "]";
  1122. break;
  1123. default:
  1124. this.cdata += `]]${String.fromCodePoint(c)}`;
  1125. this.state = S_CDATA;
  1126. }
  1127. }
  1128. /** @private */
  1129. sPIFirstChar() {
  1130. const c = this.getCode();
  1131. if (this.nameStartCheck(c)) {
  1132. this.piTarget += String.fromCodePoint(c);
  1133. this.state = S_PI_REST;
  1134. }
  1135. else if (c === QUESTION || isS(c)) {
  1136. this.fail("processing instruction without a target.");
  1137. this.state = c === QUESTION ? S_PI_ENDING : S_PI_BODY;
  1138. }
  1139. else {
  1140. this.fail("disallowed character in processing instruction name.");
  1141. this.piTarget += String.fromCodePoint(c);
  1142. this.state = S_PI_REST;
  1143. }
  1144. }
  1145. /** @private */
  1146. sPIRest() {
  1147. const c = this.captureWhileNameCheck("piTarget");
  1148. if ((c === QUESTION || isS(c))) {
  1149. this.piIsXMLDecl = this.piTarget === "xml";
  1150. if (this.piIsXMLDecl && !this.xmlDeclPossible) {
  1151. this.fail("an XML declaration must be at the start of the document.");
  1152. }
  1153. this.state = c === QUESTION ? S_PI_ENDING : S_PI_BODY;
  1154. }
  1155. else if (c) {
  1156. this.fail("disallowed character in processing instruction name.");
  1157. this.piTarget += String.fromCodePoint(c);
  1158. }
  1159. }
  1160. /** @private */
  1161. sPIBody() {
  1162. let c;
  1163. if (this.piIsXMLDecl) {
  1164. switch (this.xmlDeclState) {
  1165. case S_XML_DECL_NAME_START: {
  1166. c = this.getCode();
  1167. if (isS(c)) {
  1168. c = this.skipSpaces();
  1169. }
  1170. else if (this.requiredSeparator && c !== QUESTION) {
  1171. this.fail("whitespace required.");
  1172. }
  1173. this.requiredSeparator = false;
  1174. // The question mark character is not valid inside any of the XML
  1175. // declaration name/value pairs.
  1176. if (c === QUESTION) {
  1177. this.state = S_PI_ENDING;
  1178. return;
  1179. }
  1180. if (c) {
  1181. this.xmlDeclState = S_XML_DECL_NAME;
  1182. this.xmlDeclName = String.fromCodePoint(c);
  1183. }
  1184. break;
  1185. }
  1186. case S_XML_DECL_NAME:
  1187. c = this.captureTo(XML_DECL_NAME_TERMINATOR, "xmlDeclName");
  1188. // The question mark character is not valid inside any of the XML
  1189. // declaration name/value pairs.
  1190. if (c === QUESTION) {
  1191. this.state = S_PI_ENDING;
  1192. return;
  1193. }
  1194. if (isS(c) || c === EQUAL) {
  1195. if (!this.xmlDeclExpects.includes(this.xmlDeclName)) {
  1196. switch (this.xmlDeclName.length) {
  1197. case 0:
  1198. this.fail("did not expect any more name/value pairs.");
  1199. break;
  1200. case 1:
  1201. this.fail(`expected the name ${this.xmlDeclExpects[0]}.`);
  1202. break;
  1203. default:
  1204. this.fail(`expected one of ${this.xmlDeclExpects.join(", ")}`);
  1205. }
  1206. }
  1207. this.xmlDeclState = (c === EQUAL) ? S_XML_DECL_VALUE_START :
  1208. S_XML_DECL_EQ;
  1209. }
  1210. break;
  1211. case S_XML_DECL_EQ:
  1212. c = this.getCode();
  1213. // The question mark character is not valid inside any of the XML
  1214. // declaration name/value pairs.
  1215. if (c === QUESTION) {
  1216. this.state = S_PI_ENDING;
  1217. return;
  1218. }
  1219. if (!isS(c)) {
  1220. if (c !== EQUAL) {
  1221. this.fail("value required.");
  1222. }
  1223. this.xmlDeclState = S_XML_DECL_VALUE_START;
  1224. }
  1225. break;
  1226. case S_XML_DECL_VALUE_START:
  1227. c = this.getCode();
  1228. // The question mark character is not valid inside any of the XML
  1229. // declaration name/value pairs.
  1230. if (c === QUESTION) {
  1231. this.state = S_PI_ENDING;
  1232. return;
  1233. }
  1234. if (!isS(c)) {
  1235. if (!isQuote(c)) {
  1236. this.fail("value must be quoted.");
  1237. this.q = SPACE;
  1238. }
  1239. else {
  1240. this.q = c;
  1241. }
  1242. this.xmlDeclState = S_XML_DECL_VALUE;
  1243. }
  1244. break;
  1245. case S_XML_DECL_VALUE:
  1246. c = this.captureTo([this.q, QUESTION], "xmlDeclValue");
  1247. // The question mark character is not valid inside any of the XML
  1248. // declaration name/value pairs.
  1249. if (c === QUESTION) {
  1250. this.state = S_PI_ENDING;
  1251. return;
  1252. }
  1253. if (c) {
  1254. switch (this.xmlDeclName) {
  1255. case "version":
  1256. if (!/^1\.[0-9]+$/.test(this.xmlDeclValue)) {
  1257. this.fail("version number must match /^1\\.[0-9]+$/.");
  1258. }
  1259. this.xmlDeclExpects = ["encoding", "standalone"];
  1260. this.xmlDecl.version = this.xmlDeclValue;
  1261. break;
  1262. case "encoding":
  1263. if (!/^[A-Za-z][A-Za-z0-9._-]*$/.test(this.xmlDeclValue)) {
  1264. this.fail("encoding value must match \
  1265. /^[A-Za-z0-9][A-Za-z0-9._-]*$/.");
  1266. }
  1267. this.xmlDeclExpects = ["standalone"];
  1268. this.xmlDecl.encoding = this.xmlDeclValue;
  1269. break;
  1270. case "standalone":
  1271. if (this.xmlDeclValue !== "yes" && this.xmlDeclValue !== "no") {
  1272. this.fail("standalone value must match \"yes\" or \"no\".");
  1273. }
  1274. this.xmlDeclExpects = [];
  1275. this.xmlDecl.standalone = this.xmlDeclValue;
  1276. break;
  1277. default:
  1278. // We don't need to raise an error here since we've already
  1279. // raised one when checking what name was expected.
  1280. }
  1281. this.xmlDeclName = this.xmlDeclValue = "";
  1282. this.xmlDeclState = S_XML_DECL_NAME_START;
  1283. this.requiredSeparator = true;
  1284. }
  1285. break;
  1286. default:
  1287. throw new Error(this,
  1288. `Unknown XML declaration state: ${this.xmlDeclState}`);
  1289. }
  1290. }
  1291. else if (this.piBody.length === 0) {
  1292. c = this.getCode();
  1293. if (c === QUESTION) {
  1294. this.state = S_PI_ENDING;
  1295. }
  1296. else if (!isS(c)) {
  1297. this.piBody = String.fromCodePoint(c);
  1298. }
  1299. }
  1300. // The question mark character is not valid inside any of the XML
  1301. // declaration name/value pairs.
  1302. else if (this.captureToChar(QUESTION, "piBody")) {
  1303. this.state = S_PI_ENDING;
  1304. }
  1305. }
  1306. /** @private */
  1307. sPIEnding() {
  1308. const c = this.getCode();
  1309. if (this.piIsXMLDecl) {
  1310. if (c === GREATER) {
  1311. if (this.piTarget !== "xml") {
  1312. this.fail("processing instructions are not allowed before root.");
  1313. }
  1314. else if (this.xmlDeclState !== S_XML_DECL_NAME_START) {
  1315. this.fail("XML declaration is incomplete.");
  1316. }
  1317. else if (this.xmlDeclExpects.includes("version")) {
  1318. this.fail("XML declaration must contain a version.");
  1319. }
  1320. this.xmlDeclName = this.xmlDeclValue = "";
  1321. this.requiredSeparator = false;
  1322. this.piTarget = this.piBody = "";
  1323. this.state = S_TEXT;
  1324. }
  1325. else {
  1326. // We got here because the previous character was a ?, but the
  1327. // question mark character is not valid inside any of the XML
  1328. // declaration name/value pairs.
  1329. this.fail(
  1330. "The character ? is disallowed anywhere in XML declarations.");
  1331. }
  1332. }
  1333. else if (c === GREATER) {
  1334. if (this.piTarget.trim().toLowerCase() === "xml") {
  1335. this.fail("the XML declaration must appear at the start of the document.");
  1336. }
  1337. if (this.text.length !== 0) {
  1338. this.closeText();
  1339. }
  1340. this.onprocessinginstruction({
  1341. target: this.piTarget,
  1342. body: this.piBody,
  1343. });
  1344. this.piTarget = this.piBody = "";
  1345. this.state = S_TEXT;
  1346. }
  1347. else if (c === QUESTION) {
  1348. // We ran into ?? as part of a processing instruction. We initially
  1349. // took the first ? as a sign that the PI was ending, but it is
  1350. // not. So we have to add it to the body but we take the new ? as a
  1351. // sign that the PI is ending.
  1352. this.piBody += "?";
  1353. }
  1354. else {
  1355. this.piBody += `?${String.fromCodePoint(c)}`;
  1356. this.state = S_PI_BODY;
  1357. }
  1358. this.xmlDeclPossible = false;
  1359. }
  1360. /** @private */
  1361. sOpenTag() {
  1362. const c = this.captureNameChars();
  1363. if (!c) {
  1364. return;
  1365. }
  1366. const tag = this.tag = {
  1367. name: this.name,
  1368. attributes: Object.create(null),
  1369. };
  1370. if (this.xmlnsOpt) {
  1371. tag.ns = Object.create(null);
  1372. }
  1373. if (this.text.length !== 0) {
  1374. this.closeText();
  1375. }
  1376. this.onopentagstart(tag);
  1377. this.sawRoot = true;
  1378. if (!this.fragmentOpt && this.closedRoot) {
  1379. this.fail("documents may contain only one root.");
  1380. }
  1381. switch (c) {
  1382. case GREATER:
  1383. this.openTag();
  1384. break;
  1385. case FORWARD_SLASH:
  1386. this.state = S_OPEN_TAG_SLASH;
  1387. break;
  1388. default:
  1389. if (!isS(c)) {
  1390. this.fail("disallowed character in tag name.");
  1391. }
  1392. this.state = S_ATTRIB;
  1393. }
  1394. }
  1395. /** @private */
  1396. sOpenTagSlash() {
  1397. const c = this.getCode();
  1398. if (c === GREATER) {
  1399. this.openSelfClosingTag();
  1400. }
  1401. else {
  1402. this.fail("forward-slash in opening tag not followed by >.");
  1403. this.state = S_ATTRIB;
  1404. }
  1405. }
  1406. /** @private */
  1407. sAttrib() {
  1408. const c = this.skipSpaces();
  1409. if (!c) {
  1410. return;
  1411. }
  1412. if (isNameStartChar(c)) {
  1413. this.name = String.fromCodePoint(c);
  1414. this.state = S_ATTRIB_NAME;
  1415. }
  1416. else if (c === GREATER) {
  1417. this.openTag();
  1418. }
  1419. else if (c === FORWARD_SLASH) {
  1420. this.state = S_OPEN_TAG_SLASH;
  1421. }
  1422. else {
  1423. this.fail("disallowed character in attribute name.");
  1424. }
  1425. }
  1426. /** @private */
  1427. pushAttribNS(name, value) {
  1428. const { prefix, local } = this.qname(name);
  1429. this.attribList.push({ name, prefix, local, value, uri: undefined });
  1430. if (prefix === "xmlns") {
  1431. const trimmed = value.trim();
  1432. this.tag.ns[local] = trimmed;
  1433. nsPairCheck(this, local, trimmed);
  1434. }
  1435. else if (name === "xmlns") {
  1436. const trimmed = value.trim();
  1437. this.tag.ns[""] = trimmed;
  1438. nsPairCheck(this, "", trimmed);
  1439. }
  1440. }
  1441. /** @private */
  1442. pushAttribPlain(name, value) {
  1443. this.attribList.push({ name, value });
  1444. }
  1445. /** @private */
  1446. sAttribName() {
  1447. const c = this.captureNameChars();
  1448. if (c === EQUAL) {
  1449. this.state = S_ATTRIB_VALUE;
  1450. }
  1451. else if (isS(c)) {
  1452. this.state = S_ATTRIB_NAME_SAW_WHITE;
  1453. }
  1454. else if (c === GREATER) {
  1455. this.fail("attribute without value.");
  1456. this.pushAttrib(this.name, this.name);
  1457. this.name = this.text = "";
  1458. this.openTag();
  1459. }
  1460. else if (c) {
  1461. this.fail("disallowed character in attribute name.");
  1462. }
  1463. }
  1464. /** @private */
  1465. sAttribNameSawWhite() {
  1466. const c = this.skipSpaces();
  1467. if (!c) {
  1468. return;
  1469. }
  1470. if (c === EQUAL) {
  1471. this.state = S_ATTRIB_VALUE;
  1472. }
  1473. else {
  1474. this.fail("attribute without value.");
  1475. this.tag.attributes[this.name] = "";
  1476. this.text = "";
  1477. this.name = "";
  1478. if (c === GREATER) {
  1479. this.openTag();
  1480. }
  1481. else if (isNameStartChar(c)) {
  1482. this.name = String.fromCodePoint(c);
  1483. this.state = S_ATTRIB_NAME;
  1484. }
  1485. else {
  1486. this.fail("disallowed character in attribute name.");
  1487. this.state = S_ATTRIB;
  1488. }
  1489. }
  1490. }
  1491. /** @private */
  1492. sAttribValue() {
  1493. const c = this.getCode();
  1494. if (isQuote(c)) {
  1495. this.q = c;
  1496. this.state = S_ATTRIB_VALUE_QUOTED;
  1497. }
  1498. else if (!isS(c)) {
  1499. this.fail("unquoted attribute value.");
  1500. this.state = S_ATTRIB_VALUE_UNQUOTED;
  1501. this.text = String.fromCodePoint(c);
  1502. }
  1503. }
  1504. /** @private */
  1505. sAttribValueQuoted() {
  1506. // We deliberately do not use captureTo here. The specialized code we use
  1507. // here is faster than using captureTo.
  1508. const { q } = this;
  1509. const { chunk, limit, i: start } = this;
  1510. // eslint-disable-next-line no-constant-condition
  1511. while (true) {
  1512. if (this.i >= limit) {
  1513. // This is faster than adding codepoints one by one.
  1514. this.text += chunk.substring(start);
  1515. return;
  1516. }
  1517. const code = this.getCode();
  1518. if (code === q || code === AMP || code === LESS) {
  1519. // This is faster than adding codepoints one by one.
  1520. const slice = chunk.substring(start,
  1521. this.i - (code <= 0xFFFF ? 1 : 2));
  1522. switch (code) {
  1523. case q:
  1524. this.pushAttrib(this.name, this.text + slice);
  1525. this.name = this.text = "";
  1526. this.q = null;
  1527. this.state = S_ATTRIB_VALUE_CLOSED;
  1528. return;
  1529. case AMP:
  1530. this.text += slice;
  1531. this.state = S_ENTITY;
  1532. this.entityReturnState = S_ATTRIB_VALUE_QUOTED;
  1533. return;
  1534. default:
  1535. this.text += slice;
  1536. this.fail("disallowed character.");
  1537. return;
  1538. }
  1539. }
  1540. }
  1541. }
  1542. /** @private */
  1543. sAttribValueClosed() {
  1544. const c = this.getCode();
  1545. if (isS(c)) {
  1546. this.state = S_ATTRIB;
  1547. }
  1548. else if (c === GREATER) {
  1549. this.openTag();
  1550. }
  1551. else if (c === FORWARD_SLASH) {
  1552. this.state = S_OPEN_TAG_SLASH;
  1553. }
  1554. else if (isNameStartChar(c)) {
  1555. this.fail("no whitespace between attributes.");
  1556. this.name = String.fromCodePoint(c);
  1557. this.state = S_ATTRIB_NAME;
  1558. }
  1559. else {
  1560. this.fail("disallowed character in attribute name.");
  1561. }
  1562. }
  1563. /** @private */
  1564. sAttribValueUnquoted() {
  1565. const c = this.captureTo(ATTRIB_VALUE_UNQUOTED_TERMINATOR, "text");
  1566. if (c === AMP) {
  1567. this.state = S_ENTITY;
  1568. this.entityReturnState = S_ATTRIB_VALUE_UNQUOTED;
  1569. }
  1570. else if (c === LESS) {
  1571. this.fail("disallowed character.");
  1572. }
  1573. else if (c) {
  1574. if (this.text.includes("]]>")) {
  1575. this.fail("the string \"]]>\" is disallowed in char data.");
  1576. }
  1577. this.pushAttrib(this.name, this.text);
  1578. this.name = this.text = "";
  1579. if (c === GREATER) {
  1580. this.openTag();
  1581. }
  1582. else {
  1583. this.state = S_ATTRIB;
  1584. }
  1585. }
  1586. }
  1587. /** @private */
  1588. sCloseTag() {
  1589. const c = this.captureNameChars();
  1590. if (c === GREATER) {
  1591. this.closeTag();
  1592. }
  1593. else if (isS(c)) {
  1594. this.state = S_CLOSE_TAG_SAW_WHITE;
  1595. }
  1596. else if (c) {
  1597. this.fail("disallowed character in closing tag.");
  1598. }
  1599. }
  1600. /** @private */
  1601. sCloseTagSawWhite() {
  1602. const c = this.skipSpaces();
  1603. if (c === GREATER) {
  1604. this.closeTag();
  1605. }
  1606. else if (c) {
  1607. this.fail("disallowed character in closing tag.");
  1608. }
  1609. }
  1610. /** @private */
  1611. sEntity() {
  1612. if (this.captureToChar(SEMICOLON, "entity")) {
  1613. this.state = this.entityReturnState;
  1614. if (this.entity === "") {
  1615. this.fail("empty entity name.");
  1616. this.text += "&;";
  1617. return;
  1618. }
  1619. this.text += this.parseEntity(this.entity);
  1620. this.entity = "";
  1621. }
  1622. }
  1623. // END OF STATE HANDLERS
  1624. /**
  1625. * End parsing. This performs final well-formedness checks and resets the
  1626. * parser to a clean state.
  1627. *
  1628. * @private
  1629. *
  1630. * @returns this
  1631. */
  1632. end() {
  1633. if (!this.sawRoot) {
  1634. this.fail("document must contain a root element.");
  1635. }
  1636. const { tags } = this;
  1637. while (tags.length > 0) {
  1638. const tag = tags.pop();
  1639. this.fail(`unclosed tag: ${tag.name}`);
  1640. }
  1641. if ((this.state !== S_INITIAL) &&
  1642. (this.state !== S_TEXT)) {
  1643. this.fail("unexpected end.");
  1644. }
  1645. if (this.text.length !== 0) {
  1646. this.closeText();
  1647. }
  1648. this.closed = true;
  1649. this.onend();
  1650. this._init(this.opt);
  1651. return this;
  1652. }
  1653. /**
  1654. * If there's text to emit ``ontext``, emit it.
  1655. *
  1656. * @private
  1657. */
  1658. closeText() {
  1659. this.ontext(this.text);
  1660. this.text = "";
  1661. }
  1662. /**
  1663. * Resolve a namespace prefix.
  1664. *
  1665. * @param {string} prefix The prefix to resolve.
  1666. *
  1667. * @returns {string|undefined} The namespace URI or ``undefined`` if the
  1668. * prefix is not defined.
  1669. */
  1670. resolve(prefix) {
  1671. let uri = this.tag.ns[prefix];
  1672. if (uri !== undefined) {
  1673. return uri;
  1674. }
  1675. const { tags } = this;
  1676. for (let index = tags.length - 1; index >= 0; index--) {
  1677. uri = tags[index].ns[prefix];
  1678. if (uri !== undefined) {
  1679. return uri;
  1680. }
  1681. }
  1682. uri = this.ns[prefix];
  1683. if (uri) {
  1684. return uri;
  1685. }
  1686. const { resolvePrefix } = this.opt;
  1687. return resolvePrefix ? resolvePrefix(prefix) : undefined;
  1688. }
  1689. /**
  1690. * Parse a qname into its prefix and local name parts.
  1691. *
  1692. * @private
  1693. *
  1694. * @param {string} name The name to parse
  1695. *
  1696. * @returns {{prefix: string, local: string}}
  1697. */
  1698. qname(name) {
  1699. const colon = name.indexOf(":");
  1700. if (colon === -1) {
  1701. return { prefix: "", local: name };
  1702. }
  1703. const local = name.substring(colon + 1);
  1704. const prefix = name.substring(0, colon);
  1705. if (prefix === "" || local === "" || local.includes(":")) {
  1706. this.fail(`malformed name: ${name}.`);
  1707. }
  1708. return { prefix, local };
  1709. }
  1710. /** @private */
  1711. processAttribsNS() {
  1712. const { tag, attribList } = this;
  1713. const { name: tagName, attributes } = tag;
  1714. {
  1715. // add namespace info to tag
  1716. const { prefix, local } = this.qname(tagName);
  1717. tag.prefix = prefix;
  1718. tag.local = local;
  1719. const uri = tag.uri = this.resolve(prefix) || "";
  1720. if (prefix) {
  1721. if (prefix === "xmlns") {
  1722. this.fail("tags may not have \"xmlns\" as prefix.");
  1723. }
  1724. if (!uri) {
  1725. this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`);
  1726. tag.uri = prefix;
  1727. }
  1728. }
  1729. }
  1730. if (attribList.length === 0) {
  1731. return;
  1732. }
  1733. const seen = new Set();
  1734. // Note: do not apply default ns to attributes:
  1735. // http://www.w3.org/TR/REC-xml-names/#defaulting
  1736. for (const attr of attribList) {
  1737. const { name, prefix, local } = attr;
  1738. let uri;
  1739. let eqname;
  1740. if (prefix === "") {
  1741. uri = (name === "xmlns") ? XMLNS_NAMESPACE : "";
  1742. eqname = name;
  1743. }
  1744. else {
  1745. uri = this.resolve(prefix);
  1746. // if there's any attributes with an undefined namespace,
  1747. // then fail on them now.
  1748. if (!uri) {
  1749. this.fail(`unbound namespace prefix: ${JSON.stringify(prefix)}.`);
  1750. uri = prefix;
  1751. }
  1752. eqname = `{${uri}}${local}`;
  1753. }
  1754. if (seen.has(eqname)) {
  1755. this.fail(`duplicate attribute: ${eqname}.`);
  1756. }
  1757. seen.add(eqname);
  1758. attr.uri = uri;
  1759. attributes[name] = attr;
  1760. }
  1761. this.attribList = [];
  1762. }
  1763. /** @private */
  1764. processAttribsPlain() {
  1765. const { attribList, tag: { attributes } } = this;
  1766. for (const { name, value } of attribList) {
  1767. if (attributes[name]) {
  1768. this.fail(`duplicate attribute: ${name}.`);
  1769. }
  1770. attributes[name] = value;
  1771. }
  1772. this.attribList = [];
  1773. }
  1774. /**
  1775. * Handle a complete open tag. This parser code calls this once it has seen
  1776. * the whole tag. This method checks for well-formeness and then emits
  1777. * ``onopentag``.
  1778. *
  1779. * @private
  1780. */
  1781. openTag() {
  1782. this.processAttribs();
  1783. const { tag, tags } = this;
  1784. tag.isSelfClosing = false;
  1785. // There cannot be any pending text here due to the onopentagstart that was
  1786. // necessarily emitted before we get here. So we do not check text.
  1787. this.onopentag(tag);
  1788. tags.push(tag);
  1789. this.state = S_TEXT;
  1790. this.name = "";
  1791. }
  1792. /**
  1793. * Handle a complete self-closing tag. This parser code calls this once it has
  1794. * seen the whole tag. This method checks for well-formeness and then emits
  1795. * ``onopentag`` and ``onclosetag``.
  1796. *
  1797. * @private
  1798. */
  1799. openSelfClosingTag() {
  1800. this.processAttribs();
  1801. const { tag, tags } = this;
  1802. tag.isSelfClosing = true;
  1803. // There cannot be any pending text here due to the onopentagstart that was
  1804. // necessarily emitted before we get here. So we do not check text.
  1805. this.onopentag(tag);
  1806. this.onclosetag(tag);
  1807. const top = this.tag = tags[tags.length - 1];
  1808. if (!top) {
  1809. this.closedRoot = true;
  1810. }
  1811. this.state = S_TEXT;
  1812. this.name = "";
  1813. }
  1814. /**
  1815. * Handle a complete close tag. This parser code calls this once it has seen
  1816. * the whole tag. This method checks for well-formeness and then emits
  1817. * ``onclosetag``.
  1818. *
  1819. * @private
  1820. */
  1821. closeTag() {
  1822. const { tags, name } = this;
  1823. // Our state after this will be S_TEXT, no matter what, and we can clear
  1824. // tagName now.
  1825. this.state = S_TEXT;
  1826. this.name = "";
  1827. if (!name) {
  1828. this.fail("weird empty close tag.");
  1829. this.text += "</>";
  1830. return;
  1831. }
  1832. let l = tags.length;
  1833. while (l-- > 0) {
  1834. const tag = this.tag = tags.pop();
  1835. if (this.text.length !== 0) {
  1836. this.closeText();
  1837. }
  1838. this.onclosetag(tag);
  1839. if (tag.name === name) {
  1840. break;
  1841. }
  1842. this.fail("unexpected close tag.");
  1843. }
  1844. if (l === 0) {
  1845. this.closedRoot = true;
  1846. }
  1847. else if (l < 0) {
  1848. this.fail(`unmatched closing tag: ${name}.`);
  1849. this.text += `</${name}>`;
  1850. }
  1851. }
  1852. /**
  1853. * Resolves an entity. Makes any necessary well-formedness checks.
  1854. *
  1855. * @private
  1856. *
  1857. * @param {string} entity The entity to resolve.
  1858. *
  1859. * @returns {string} The parsed entity.
  1860. */
  1861. parseEntity(entity) {
  1862. if (entity[0] !== "#") {
  1863. const defined = this.ENTITIES[entity];
  1864. if (defined) {
  1865. return defined;
  1866. }
  1867. this.fail(this.isName(entity) ? "undefined entity." :
  1868. "disallowed character in entity name.");
  1869. return `&${entity};`;
  1870. }
  1871. let num = NaN;
  1872. if (entity[1] === "x" && /^#x[0-9a-f]+$/i.test(entity)) {
  1873. num = parseInt(entity.slice(2), 16);
  1874. }
  1875. else if (/^#[0-9]+$/.test(entity)) {
  1876. num = parseInt(entity.slice(1), 10);
  1877. }
  1878. // The character reference is required to match the CHAR production.
  1879. if (!isChar(num)) {
  1880. this.fail("malformed character entity.");
  1881. return `&${entity};`;
  1882. }
  1883. return String.fromCodePoint(num);
  1884. }
  1885. }
  1886. exports.SaxesParser = SaxesParser;