index.js 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144
  1. 'use strict';
  2. var Preprocessor = require('./preprocessor'),
  3. UNICODE = require('../common/unicode'),
  4. neTree = require('./named_entity_data');
  //Aliases
  // `$` and `$$` are short handles for the CODE_POINTS and CODE_POINT_SEQUENCES
  // members of the shared unicode module; they are read throughout this file.
  var $ = UNICODE.CODE_POINTS;
  var $$ = UNICODE.CODE_POINT_SEQUENCES;
  //Replacement code points for numeric entities
  // Maps the code point written in a numeric character reference to the code point
  // that is returned instead (looked up by _consumeNumericEntity).
  var NUMERIC_ENTITY_REPLACEMENTS = {
      0x00: 0xFFFD,
      0x0D: 0x000D,
      0x80: 0x20AC, 0x81: 0x0081, 0x82: 0x201A, 0x83: 0x0192,
      0x84: 0x201E, 0x85: 0x2026, 0x86: 0x2020, 0x87: 0x2021,
      0x88: 0x02C6, 0x89: 0x2030, 0x8A: 0x0160, 0x8B: 0x2039,
      0x8C: 0x0152, 0x8D: 0x008D, 0x8E: 0x017D, 0x8F: 0x008F,
      0x90: 0x0090, 0x91: 0x2018, 0x92: 0x2019, 0x93: 0x201C,
      0x94: 0x201D, 0x95: 0x2022, 0x96: 0x2013, 0x97: 0x2014,
      0x98: 0x02DC, 0x99: 0x2122, 0x9A: 0x0161, 0x9B: 0x203A,
      0x9C: 0x0153, 0x9D: 0x009D, 0x9E: 0x017E, 0x9F: 0x0178
  };
  // Named entity tree flags
  // Bit flags stored in the marker element of a named entity tree node.
  var HAS_DATA_FLAG = 0x1,
      DATA_DUPLET_FLAG = 0x2,
      HAS_BRANCHES_FLAG = 0x4,
      // All three flags combined; tree elements are compared against this value
      // to tell node markers apart from plain code points.
      MAX_BRANCH_MARKER_VALUE = HAS_DATA_FLAG | DATA_DUPLET_FLAG | HAS_BRANCHES_FLAG;
  //States
  // Every tokenizer state is identified by a string that is also the name of the
  // prototype method implementing it (states are dispatched via `this[this.state](cp)`).
  var DATA_STATE = 'DATA_STATE';
  var CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE';
  var RCDATA_STATE = 'RCDATA_STATE';
  var CHARACTER_REFERENCE_IN_RCDATA_STATE = 'CHARACTER_REFERENCE_IN_RCDATA_STATE';
  var RAWTEXT_STATE = 'RAWTEXT_STATE';
  var SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE';
  var PLAINTEXT_STATE = 'PLAINTEXT_STATE';
  var TAG_OPEN_STATE = 'TAG_OPEN_STATE';
  var END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE';
  var TAG_NAME_STATE = 'TAG_NAME_STATE';
  var RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE';
  var RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE';
  var RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE';
  var RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE';
  var RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE';
  var RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE';
  var SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE';
  var SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE';
  var SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE';
  var SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE';
  var SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE';
  var SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE';
  var SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE';
  var SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE';
  var BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE';
  var ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE';
  var AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE';
  var BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE';
  var ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE';
  var ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE';
  var ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE';
  var CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE = 'CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE';
  var AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE';
  var SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE';
  var BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE';
  var BOGUS_COMMENT_STATE_CONTINUATION = 'BOGUS_COMMENT_STATE_CONTINUATION';
  var MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE';
  var COMMENT_START_STATE = 'COMMENT_START_STATE';
  var COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE';
  var COMMENT_STATE = 'COMMENT_STATE';
  var COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE';
  var COMMENT_END_STATE = 'COMMENT_END_STATE';
  var COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE';
  var DOCTYPE_STATE = 'DOCTYPE_STATE';
  var DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE';
  var AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE';
  var BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
  var DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE';
  var DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE';
  var BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE';
  var BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  var DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE';
  var DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE';
  var AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
  var BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE';
  var CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
  87. //Utils
  //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
  //these functions if they are located in another module, because of the context switch.
  //Always perform an inlining check before modifying these functions ('node --trace-inlining').
  /**
   * Tells whether a code point is one of the four whitespace code points the
   * tokenizer checks for: space, line feed, tabulation or form feed.
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isWhitespace(cp) {
      switch (cp) {
          case $.SPACE:
          case $.LINE_FEED:
          case $.TABULATION:
          case $.FORM_FEED:
              return true;

          default:
              return false;
      }
  }
  /**
   * Tells whether a code point lies in the range DIGIT_0..DIGIT_9 (inclusive).
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isAsciiDigit(cp) {
      if (cp < $.DIGIT_0)
          return false;

      return cp <= $.DIGIT_9;
  }
  /**
   * Tells whether a code point lies in the range LATIN_CAPITAL_A..LATIN_CAPITAL_Z (inclusive).
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isAsciiUpper(cp) {
      var atLeastA = cp >= $.LATIN_CAPITAL_A,
          atMostZ = cp <= $.LATIN_CAPITAL_Z;

      return atLeastA && atMostZ;
  }
  /**
   * Tells whether a code point lies in the range LATIN_SMALL_A..LATIN_SMALL_Z (inclusive).
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isAsciiLower(cp) {
      if (cp > $.LATIN_SMALL_Z)
          return false;

      return cp >= $.LATIN_SMALL_A;
  }
  /**
   * Tells whether a code point is an ASCII letter, i.e. accepted by either
   * isAsciiLower() or isAsciiUpper().
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isAsciiLetter(cp) {
      if (isAsciiLower(cp))
          return true;

      return isAsciiUpper(cp);
  }
  /**
   * Tells whether a code point is an ASCII letter or an ASCII digit.
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isAsciiAlphaNumeric(cp) {
      if (isAsciiLetter(cp))
          return true;

      return isAsciiDigit(cp);
  }
  /**
   * Tells whether a code point is a digit of a numeric character reference.
   * ASCII digits are always accepted; A-F / a-f are accepted only when `isHex` is truthy.
   *
   * @param {number} cp - code point to test
   * @param {boolean} isHex - whether hexadecimal digits are allowed
   * @returns {boolean} (a falsy `isHex` is returned as is when `cp` is not an ASCII digit)
   */
  function isDigit(cp, isHex) {
      if (isAsciiDigit(cp))
          return true;

      if (!isHex)
          return isHex;

      var isUpperHexLetter = cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F,
          isLowerHexLetter = cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F;

      return isUpperHexLetter || isLowerHexLetter;
  }
  /**
   * Tells whether a code point is in the surrogate range 0xD800..0xDFFF
   * or above 0x10FFFF.
   *
   * @param {number} cp - code point to test
   * @returns {boolean}
   */
  function isReservedCodePoint(cp) {
      if (cp > 0x10FFFF)
          return true;

      return cp >= 0xD800 && cp <= 0xDFFF;
  }
  /**
   * Adds 0x20 to a code point, which maps an ASCII upper-case letter to its
   * lower-case counterpart. No range check is performed here.
   *
   * @param {number} cp - code point to shift
   * @returns {number} cp + 0x20
   */
  function toAsciiLowerCodePoint(cp) {
      var ASCII_CASE_OFFSET = 0x0020;

      return cp + ASCII_CASE_OFFSET;
  }
  //NOTE: String.fromCharCode() can only handle characters from the BMP subset,
  //so code points above 0xFFFF are converted to a surrogate pair manually.
  //(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values)
  /**
   * Converts a code point to a string.
   *
   * @param {number} cp - code point to convert
   * @returns {string} one UTF-16 code unit for cp <= 0xFFFF, otherwise a surrogate pair
   */
  function toChar(cp) {
      if (cp <= 0xFFFF)
          return String.fromCharCode(cp);

      var offset = cp - 0x10000,
          highSurrogate = offset >>> 10 & 0x3FF | 0xD800,
          lowSurrogate = 0xDC00 | offset & 0x3FF;

      return String.fromCharCode(highSurrogate) + String.fromCharCode(lowSurrogate);
  }
  /**
   * Converts a code point to the character obtained after shifting it with
   * toAsciiLowerCodePoint().
   *
   * @param {number} cp - code point (callers are expected to pass an ASCII upper-case letter)
   * @returns {string}
   */
  function toAsciiLowerChar(cp) {
      var loweredCp = toAsciiLowerCodePoint(cp);

      return String.fromCharCode(loweredCp);
  }
  131. function findNamedEntityTreeBranch(nodeIx, cp) {
  132. var branchCount = neTree[++nodeIx],
  133. lo = ++nodeIx,
  134. hi = lo + branchCount - 1;
  135. while (lo <= hi) {
  136. var mid = lo + hi >>> 1,
  137. midCp = neTree[mid];
  138. if (midCp < cp)
  139. lo = mid + 1;
  140. else if (midCp > cp)
  141. hi = mid - 1;
  142. else
  143. return neTree[mid + branchCount];
  144. }
  145. return -1;
  146. }
  //Tokenizer
  /**
   * Tokenizer constructor (also the module export).
   * Creates its own Preprocessor and starts inactive, in the data state.
   */
  var Tokenizer = module.exports = function () {
      // Input source; code points are pulled from it via advance()/retreat().
      this.preprocessor = new Preprocessor();

      // Tokens that are ready to be handed out by getNextToken().
      this.tokenQueue = [];

      this.allowCDATA = false;

      // Current state name and the state to return to.
      this.state = DATA_STATE;
      this.returnState = '';

      this.tempBuff = [];
      this.additionalAllowedCp = undefined;
      this.lastStartTagName = '';

      // Number of code points consumed since the last hibernation snapshot.
      this.consumedAfterSnapshot = -1;

      // Becomes true once input is written; cleared on hibernation.
      this.active = false;

      // Tokens/attribute under construction.
      this.currentCharacterToken = null;
      this.currentToken = null;
      this.currentAttr = null;
  };
  //Token types
  // Each token type is exposed as a static property whose value equals its own name.
  [
      'CHARACTER_TOKEN',
      'NULL_CHARACTER_TOKEN',
      'WHITESPACE_CHARACTER_TOKEN',
      'START_TAG_TOKEN',
      'END_TAG_TOKEN',
      'COMMENT_TOKEN',
      'DOCTYPE_TOKEN',
      'EOF_TOKEN',
      'HIBERNATION_TOKEN'
  ].forEach(function (tokenType) {
      Tokenizer[tokenType] = tokenType;
  });
  //Tokenizer initial states for different modes
  // Maps a mode name to the state name the tokenizer should be put into for that mode.
  Tokenizer.MODE = {};
  Tokenizer.MODE.DATA = DATA_STATE;
  Tokenizer.MODE.RCDATA = RCDATA_STATE;
  Tokenizer.MODE.RAWTEXT = RAWTEXT_STATE;
  Tokenizer.MODE.SCRIPT_DATA = SCRIPT_DATA_STATE;
  Tokenizer.MODE.PLAINTEXT = PLAINTEXT_STATE;
  181. //Static
  /**
   * Looks up an attribute value on a token by attribute name.
   * The attribute list is scanned from the end towards the start.
   *
   * @param {Object} token - token with an `attrs` array of {name, value} entries
   * @param {string} attrName - attribute name to search for
   * @returns {string|null} the attribute value, or null when no such attribute exists
   */
  Tokenizer.getTokenAttr = function (token, attrName) {
      var idx = token.attrs.length;

      while (idx-- > 0) {
          var attr = token.attrs[idx];

          if (attr.name === attrName)
              return attr.value;
      }

      return null;
  };
  189. //API
  /**
   * Runs the state machine until at least one token is queued (or the tokenizer
   * becomes inactive) and returns the first queued token.
   *
   * @returns {Object|undefined} the next token; undefined when the queue is empty
   */
  Tokenizer.prototype.getNextToken = function () {
      while (this.tokenQueue.length === 0 && this.active) {
          this._hibernationSnapshot();

          var cp = this._consume();

          // If the end of the current chunk was hit, a hibernation token has been
          // queued instead of running the state handler.
          if (this._ensureHibernation())
              continue;

          this[this.state](cp);
      }

      return this.tokenQueue.shift();
  };
  /**
   * Activates the tokenizer and forwards a chunk of input to the preprocessor.
   *
   * @param {string} chunk - input chunk
   * @param {boolean} isLastChunk - forwarded as is to Preprocessor#write
   */
  Tokenizer.prototype.write = function (chunk, isLastChunk) {
      var preprocessor = this.preprocessor;

      this.active = true;
      preprocessor.write(chunk, isLastChunk);
  };
  /**
   * Activates the tokenizer and asks the preprocessor to insert a chunk of HTML
   * at its current position.
   *
   * @param {string} chunk - HTML to insert
   */
  Tokenizer.prototype.insertHtmlAtCurrentPos = function (chunk) {
      var preprocessor = this.preprocessor;

      this.active = true;
      preprocessor.insertHtmlAtCurrentPos(chunk);
  };
  207. //Hibernation
  /**
   * Starts a new snapshot: resets the count of code points consumed since the
   * snapshot, so that _ensureHibernation() knows how far to roll back.
   */
  Tokenizer.prototype._hibernationSnapshot = function () {
      var NOTHING_CONSUMED = 0;

      this.consumedAfterSnapshot = NOTHING_CONSUMED;
  };
  /**
   * Puts the tokenizer to sleep when the preprocessor reports that the end of
   * the current chunk was hit: everything consumed since the last snapshot is
   * given back, the tokenizer is deactivated and a HIBERNATION_TOKEN is queued.
   *
   * @returns {boolean} true when hibernation happened, false otherwise
   */
  Tokenizer.prototype._ensureHibernation = function () {
      if (!this.preprocessor.endOfChunkHit)
          return false;

      while (this.consumedAfterSnapshot > 0) {
          this.preprocessor.retreat();
          this.consumedAfterSnapshot--;
      }

      this.active = false;
      this.tokenQueue.push({type: Tokenizer.HIBERNATION_TOKEN});

      return true;
  };
  221. //Consumption
  /**
   * Advances the preprocessor by one position and counts the step for the
   * current hibernation snapshot.
   *
   * @returns {number} whatever Preprocessor#advance returns (used as a code point by callers)
   */
  Tokenizer.prototype._consume = function () {
      this.consumedAfterSnapshot += 1;

      return this.preprocessor.advance();
  };
  /**
   * Undoes one _consume(): decrements the snapshot counter and moves the
   * preprocessor one position back.
   */
  Tokenizer.prototype._unconsume = function () {
      this.consumedAfterSnapshot -= 1;
      this.preprocessor.retreat();
  };
  /**
   * Calls _unconsume() `count` times.
   *
   * @param {number} count - number of positions to give back (expected to be a non-negative integer)
   */
  Tokenizer.prototype._unconsumeSeveral = function (count) {
      for (var remaining = count; remaining; remaining--)
          this._unconsume();
  };
  /**
   * Switches to `state` and gives the current code point back, so that the new
   * state's handler receives the same code point on the next step.
   *
   * @param {string} state - name of the state to switch to
   */
  Tokenizer.prototype._reconsumeInState = function (state) {
      var nextState = state;

      this.state = nextState;
      this._unconsume();
  };
  /**
   * Checks whether the input, starting with the already consumed `startCp`,
   * matches `pattern`, consuming the following code points while it does.
   * On a mismatch (or EOF) everything consumed by this call is given back.
   *
   * When `caseSensitive` is falsy, an input code point also matches if it equals
   * the pattern code point shifted by toAsciiLowerCodePoint().
   *
   * @param {number[]} pattern - code points to match
   * @param {number} startCp - code point matched against pattern[0] (already consumed by the caller)
   * @param {boolean} caseSensitive - require exact code point equality
   * @returns {boolean} true when the whole pattern matched
   */
  Tokenizer.prototype._consumeSubsequentIfMatch = function (pattern, startCp, caseSensitive) {
      var extraConsumed = 0,
          cp = startCp;

      for (var pos = 0, len = pattern.length; pos < len; pos++) {
          // The first pattern position is matched against startCp itself.
          if (pos > 0) {
              cp = this._consume();
              extraConsumed++;
          }

          var expectedCp = pattern[pos],
              mismatch = cp === $.EOF ||
                         cp !== expectedCp && (caseSensitive || cp !== toAsciiLowerCodePoint(expectedCp));

          if (mismatch) {
              this._unconsumeSeveral(extraConsumed);
              return false;
          }
      }

      return true;
  };
  264. //Lookahead
  /**
   * Peeks at the next code point: consumes it and immediately gives it back.
   *
   * @returns {number} the next code point
   */
  Tokenizer.prototype._lookahead = function () {
      var nextCp = this._consume();

      this._unconsume();

      return nextCp;
  };
  270. //Temp buffer
  /**
   * Compares the temporary buffer with the SCRIPT_STRING code point sequence,
   * element by element.
   *
   * @returns {boolean} true when both have the same length and the same code points
   */
  Tokenizer.prototype.isTempBufferEqualToScriptString = function () {
      var buffered = this.tempBuff.length;

      if (buffered !== $$.SCRIPT_STRING.length)
          return false;

      var pos = 0;

      while (pos < this.tempBuff.length) {
          if (this.tempBuff[pos] !== $$.SCRIPT_STRING[pos])
              return false;

          pos++;
      }

      return true;
  };
  280. //Token creation
  /**
   * Makes an empty start tag token the current token.
   */
  Tokenizer.prototype._createStartTagToken = function () {
      var startTag = {
          type: Tokenizer.START_TAG_TOKEN,
          tagName: '',
          selfClosing: false,
          attrs: []
      };

      this.currentToken = startTag;
  };
  /**
   * Makes an empty end tag token the current token.
   */
  Tokenizer.prototype._createEndTagToken = function () {
      var endTag = {
          type: Tokenizer.END_TAG_TOKEN,
          tagName: '',
          attrs: []
      };

      this.currentToken = endTag;
  };
  /**
   * Makes an empty comment token the current token.
   */
  Tokenizer.prototype._createCommentToken = function () {
      var comment = {
          type: Tokenizer.COMMENT_TOKEN,
          data: ''
      };

      this.currentToken = comment;
  };
  /**
   * Makes a new DOCTYPE token the current token. Public and system identifiers
   * start out as null and forceQuirks as false.
   *
   * @param {*} initialName - initial value of the token's `name` property
   */
  Tokenizer.prototype._createDoctypeToken = function (initialName) {
      var doctype = {
          type: Tokenizer.DOCTYPE_TOKEN,
          name: initialName,
          forceQuirks: false,
          publicId: null,
          systemId: null
      };

      this.currentToken = doctype;
  };
  /**
   * Starts a new pending character token.
   *
   * @param {string} type - one of the character token types
   * @param {string} ch - initial character data
   */
  Tokenizer.prototype._createCharacterToken = function (type, ch) {
      var characterToken = {
          type: type,
          chars: ch
      };

      this.currentCharacterToken = characterToken;
  };
  317. //Tag attributes
  /**
   * Starts a new current attribute with an empty value.
   *
   * @param {string} attrNameFirstCh - initial content of the attribute name
   */
  Tokenizer.prototype._createAttr = function (attrNameFirstCh) {
      var attr = {
          name: attrNameFirstCh,
          value: ''
      };

      this.currentAttr = attr;
  };
  /**
   * Tells whether the current token already has an attribute with the same
   * name as the current attribute.
   *
   * @returns {boolean}
   */
  Tokenizer.prototype._isDuplicateAttr = function () {
      var existingValue = Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name);

      return existingValue !== null;
  };
  /**
   * Finishes the attribute name: switches to `toState` and attaches the current
   * attribute to the current token unless an attribute with that name already exists.
   *
   * @param {string} toState - name of the state to switch to
   */
  Tokenizer.prototype._leaveAttrName = function (toState) {
      this.state = toState;

      if (this._isDuplicateAttr())
          return;

      this.currentToken.attrs.push(this.currentAttr);
  };
  /**
   * Finishes the attribute value; this only switches to `toState`.
   *
   * @param {string} toState - name of the state to switch to
   */
  Tokenizer.prototype._leaveAttrValue = function (toState) {
      var nextState = toState;

      this.state = nextState;
  };
  //Appropriate end tag token
  //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#appropriate-end-tag-token)
  /**
   * Tells whether the current token's tag name equals the tag name of the last
   * emitted start tag.
   *
   * @returns {boolean}
   */
  Tokenizer.prototype._isAppropriateEndTagToken = function () {
      var currentTagName = this.currentToken.tagName;

      return this.lastStartTagName === currentTagName;
  };
  340. //Token emission
  /**
   * Queues the current token. A pending character token is flushed first so
   * that the queue keeps the input order.
   */
  Tokenizer.prototype._emitCurrentToken = function () {
      this._emitCurrentCharacterToken();

      var token = this.currentToken;

      //NOTE: store the emitted start tag's tagName to determine whether the following end tag token is appropriate.
      if (token.type === Tokenizer.START_TAG_TOKEN)
          this.lastStartTagName = token.tagName;

      this.tokenQueue.push(token);
      this.currentToken = null;
  };
  /**
   * Queues the pending character token, if there is one, and clears it.
   */
  Tokenizer.prototype._emitCurrentCharacterToken = function () {
      var pending = this.currentCharacterToken;

      if (!pending)
          return;

      this.tokenQueue.push(pending);
      this.currentCharacterToken = null;
  };
  /**
   * Flushes the pending character token and queues an EOF token.
   */
  Tokenizer.prototype._emitEOFToken = function () {
      var eofToken = {type: Tokenizer.EOF_TOKEN};

      this._emitCurrentCharacterToken();
      this.tokenQueue.push(eofToken);
  };
  //Characters emission
  //OPTIMIZATION: the specification uses only one type of character token (one token per character).
  //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
  //If we have a sequence of characters that belong to the same group, the parser can process it
  //as a single solid character token.
  //So, there are 3 types of character tokens in parse5:
  //1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
  //2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n \r\t \f')
  //3)CHARACTER_TOKEN - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
  /**
   * Appends `ch` to the pending character token of the given type. A pending
   * token of a different type is emitted first; a new token is started when
   * nothing is pending.
   *
   * @param {string} type - character token type of `ch`
   * @param {string} ch - character data to append
   */
  Tokenizer.prototype._appendCharToCurrentCharacterToken = function (type, ch) {
      var pending = this.currentCharacterToken;

      if (pending && pending.type !== type) {
          this._emitCurrentCharacterToken();
          pending = this.currentCharacterToken;
      }

      if (pending)
          pending.chars += ch;
      else
          this._createCharacterToken(type, ch);
  };
  /**
   * Emits a code point as character data, classifying it as whitespace, null
   * or ordinary character first.
   *
   * @param {number} cp - code point to emit
   */
  Tokenizer.prototype._emitCodePoint = function (cp) {
      var type;

      if (isWhitespace(cp))
          type = Tokenizer.WHITESPACE_CHARACTER_TOKEN;
      else if (cp === $.NULL)
          type = Tokenizer.NULL_CHARACTER_TOKEN;
      else
          type = Tokenizer.CHARACTER_TOKEN;

      this._appendCharToCurrentCharacterToken(type, toChar(cp));
  };
  /**
   * Emits every code point of the list, in order, via _emitCodePoint().
   *
   * @param {number[]} codePoints - code points to emit
   */
  Tokenizer.prototype._emitSeveralCodePoints = function (codePoints) {
      var pos = 0;

      while (pos < codePoints.length) {
          this._emitCodePoint(codePoints[pos]);
          pos++;
      }
  };
  //NOTE: used when we emit a character explicitly. This is always a non-whitespace and a non-null character,
  //so we can avoid the additional checks here.
  /**
   * Emits `ch` as ordinary (CHARACTER_TOKEN) character data without classifying it.
   *
   * @param {string} ch - character to emit
   */
  Tokenizer.prototype._emitChar = function (ch) {
      var type = Tokenizer.CHARACTER_TOKEN;

      this._appendCharToCurrentCharacterToken(type, ch);
  };
  393. //Character reference tokenization
  /**
   * Consumes the digits of a numeric character reference (plus an optional
   * trailing semicolon) and returns the code point it refers to.
   * The caller has already verified that at least one digit follows.
   *
   * @param {boolean} isHex - whether the digits are hexadecimal
   * @returns {number} the replacement from NUMERIC_ENTITY_REPLACEMENTS if there is one,
   *                   REPLACEMENT_CHARACTER for reserved code points, otherwise the parsed code point
   */
  Tokenizer.prototype._consumeNumericEntity = function (isHex) {
      var radix = isHex ? 16 : 10,
          digitChars = '';

      for (;;) {
          digitChars += toChar(this._consume());

          var upcomingCp = this._lookahead();

          if (upcomingCp === $.EOF || !isDigit(upcomingCp, isHex))
              break;
      }

      // The terminating semicolon is optional; swallow it when present.
      if (this._lookahead() === $.SEMICOLON)
          this._consume();

      var referencedCp = parseInt(digitChars, radix),
          replacement = NUMERIC_ENTITY_REPLACEMENTS[referencedCp];

      if (replacement)
          return replacement;

      return isReservedCodePoint(referencedCp) ? $.REPLACEMENT_CHARACTER : referencedCp;
  };
  // NOTE: for the details on this algorithm see
  // https://github.com/inikulin/parse5/tree/master/scripts/generate_named_entity_data/README.md
  /**
   * Walks the named entity tree while consuming input and returns the code
   * points of the longest matched entity.
   *
   * @param {boolean} inAttr - whether the reference is consumed as part of an attribute value
   * @returns {number[]|null} one or two code points, or null when nothing matched
   *                          (in that case everything consumed here is given back)
   */
  Tokenizer.prototype._consumeNamedEntity = function (inAttr) {
      var matchedCodePoints = null,
          matchedLength = 0,
          cp = null,
          consumedCount = 0,
          endsWithSemicolon = false,
          treeIx = 0;

      while (treeIx > -1) {
          var current = neTree[treeIx],
              // Elements below the marker limit are node markers (flag sets);
              // anything else is a code point that must match the input.
              inNode = current < MAX_BRANCH_MARKER_VALUE;

          if (inNode && (current & HAS_DATA_FLAG)) {
              // The node's data (one or two code points) follows the marker.
              if (current & DATA_DUPLET_FLAG) {
                  matchedCodePoints = [neTree[treeIx + 1], neTree[treeIx + 2]];
                  treeIx += 2;
              }
              else {
                  matchedCodePoints = [neTree[treeIx + 1]];
                  treeIx += 1;
              }

              matchedLength = consumedCount;

              if (cp === $.SEMICOLON) {
                  endsWithSemicolon = true;
                  break;
              }
          }

          cp = this._consume();
          consumedCount++;

          if (cp === $.EOF)
              break;

          if (!inNode)
              treeIx = cp === current ? treeIx + 1 : -1;
          else if (current & HAS_BRANCHES_FLAG)
              treeIx = findNamedEntityTreeBranch(treeIx, cp);
          else
              treeIx = -1;
      }

      if (!matchedCodePoints) {
          this._unconsumeSeveral(consumedCount);
          return null;
      }

      if (endsWithSemicolon)
          return matchedCodePoints;

      //NOTE: unconsume excess (e.g. 'it' in '&notit')
      this._unconsumeSeveral(consumedCount - matchedLength);

      //NOTE: If the character reference is being consumed as part of an attribute and the next character
      //is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII character, then, for historical
      //reasons, all the characters that were matched after the U+0026 AMPERSAND character (&) must be
      //unconsumed, and nothing is returned.
      //However, if this next character is in fact a U+003D EQUALS SIGN character (=), then this is a
      //parse error, because some legacy user agents will misinterpret the markup in those cases.
      //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references)
      if (inAttr) {
          var nextCp = this._lookahead();

          if (nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp)) {
              this._unconsumeSeveral(matchedLength);
              return null;
          }
      }

      return matchedCodePoints;
  };
  /**
   * Tries to consume a character reference whose first code point after the
   * ampersand (`startCp`) has already been consumed.
   *
   * @param {number} startCp - the code point following the ampersand
   * @param {boolean} inAttr - whether the reference is part of an attribute value
   * @returns {number[]|null} the referenced code points, or null when this is not a
   *                          character reference (consumed code points are given back)
   */
  Tokenizer.prototype._consumeCharacterReference = function (startCp, inAttr) {
      var notAReference = isWhitespace(startCp) ||
                          startCp === $.GREATER_THAN_SIGN ||
                          startCp === $.AMPERSAND ||
                          startCp === this.additionalAllowedCp ||
                          startCp === $.EOF;

      if (notAReference) {
          //NOTE: not a character reference. No characters are consumed, and nothing is returned.
          this._unconsume();
          return null;
      }

      if (startCp !== $.NUMBER_SIGN) {
          // Named entity candidate: give startCp back and let the tree walker read it.
          this._unconsume();
          return this._consumeNamedEntity(inAttr);
      }

      //NOTE: we have a numeric entity candidate, now we should determine if it's hex or decimal
      var isHex = false,
          upcomingCp = this._lookahead();

      if (upcomingCp === $.LATIN_SMALL_X || upcomingCp === $.LATIN_CAPITAL_X) {
          this._consume();
          isHex = true;
      }

      upcomingCp = this._lookahead();

      //NOTE: if we have at least one digit this is a numeric entity for sure, so we consume it
      if (upcomingCp !== $.EOF && isDigit(upcomingCp, isHex))
          return [this._consumeNumericEntity(isHex)];

      //NOTE: otherwise this is a bogus number entity and a parse error. Unconsume the number sign
      //and the 'x'-character if appropriate.
      this._unconsumeSeveral(isHex ? 2 : 1);

      return null;
  };
  //State machine
  // Shorthand for the prototype: each state handler below is stored on it under
  // the state's name, which is how getNextToken() dispatches (`this[this.state](cp)`).
  var _ = Tokenizer.prototype;
  //12.2.4.1 Data state
  //------------------------------------------------------------------
  // Handles one code point in the data state: '&' and '<' switch state, EOF emits
  // the EOF token, everything else (including NULL) is emitted as character data.
  _[DATA_STATE] = function dataState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.AMPERSAND:
              this.state = CHARACTER_REFERENCE_IN_DATA_STATE;
              break;

          case $.LESS_THAN_SIGN:
              this.state = TAG_OPEN_STATE;
              break;

          case $.NULL:
              // NULL is emitted unchanged in this state.
              this._emitCodePoint(cp);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  //12.2.4.2 Character reference in data state
  //------------------------------------------------------------------
  // Tries to consume a character reference; emits its code points, or a literal
  // '&' when there is none, and returns to the data state. Nothing is emitted
  // when the tokenizer had to hibernate in the middle of the reference.
  _[CHARACTER_REFERENCE_IN_DATA_STATE] = function characterReferenceInDataState(cp) {
      this.additionalAllowedCp = void 0;

      var referenced = this._consumeCharacterReference(cp, false);

      if (this._ensureHibernation())
          return;

      if (referenced)
          this._emitSeveralCodePoints(referenced);
      else
          this._emitChar('&');

      this.state = DATA_STATE;
  };
  //12.2.4.3 RCDATA state
  //------------------------------------------------------------------
  // Handles one code point in the RCDATA state: '&' and '<' switch state, NULL is
  // replaced with the replacement character, EOF emits the EOF token, everything
  // else is emitted as character data.
  _[RCDATA_STATE] = function rcdataState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.AMPERSAND:
              this.state = CHARACTER_REFERENCE_IN_RCDATA_STATE;
              break;

          case $.LESS_THAN_SIGN:
              this.state = RCDATA_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  //12.2.4.4 Character reference in RCDATA state
  //------------------------------------------------------------------
  // Tries to consume a character reference; emits its code points, or a literal
  // '&' when there is none, and returns to the RCDATA state. Nothing is emitted
  // when the tokenizer had to hibernate in the middle of the reference.
  _[CHARACTER_REFERENCE_IN_RCDATA_STATE] = function characterReferenceInRcdataState(cp) {
      this.additionalAllowedCp = void 0;

      var referenced = this._consumeCharacterReference(cp, false);

      if (this._ensureHibernation())
          return;

      if (referenced)
          this._emitSeveralCodePoints(referenced);
      else
          this._emitChar('&');

      this.state = RCDATA_STATE;
  };
  //12.2.4.5 RAWTEXT state
  //------------------------------------------------------------------
  // Handles one code point in the RAWTEXT state: '<' switches state, NULL is
  // replaced with the replacement character, EOF emits the EOF token, everything
  // else is emitted as character data.
  _[RAWTEXT_STATE] = function rawtextState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.LESS_THAN_SIGN:
              this.state = RAWTEXT_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  //12.2.4.6 Script data state
  //------------------------------------------------------------------
  // Handles one code point in the script data state: '<' switches state, NULL is
  // replaced with the replacement character, EOF emits the EOF token, everything
  // else is emitted as character data.
  _[SCRIPT_DATA_STATE] = function scriptDataState(cp) {
      this.preprocessor.dropParsedChunk();

      switch (cp) {
          case $.LESS_THAN_SIGN:
              this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;
              break;

          case $.NULL:
              this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
              break;

          case $.EOF:
              this._emitEOFToken();
              break;

          default:
              this._emitCodePoint(cp);
      }
  };
  575. //12.2.4.7 PLAINTEXT state
  576. //------------------------------------------------------------------
  577. _[PLAINTEXT_STATE] = function plaintextState(cp) {
  578. this.preprocessor.dropParsedChunk();
  579. if (cp === $.NULL)
  580. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  581. else if (cp === $.EOF)
  582. this._emitEOFToken();
  583. else
  584. this._emitCodePoint(cp);
  585. };
  586. //12.2.4.8 Tag open state
  587. //------------------------------------------------------------------
  588. _[TAG_OPEN_STATE] = function tagOpenState(cp) {
  589. if (cp === $.EXCLAMATION_MARK)
  590. this.state = MARKUP_DECLARATION_OPEN_STATE;
  591. else if (cp === $.SOLIDUS)
  592. this.state = END_TAG_OPEN_STATE;
  593. else if (isAsciiLetter(cp)) {
  594. this._createStartTagToken();
  595. this._reconsumeInState(TAG_NAME_STATE);
  596. }
  597. else if (cp === $.QUESTION_MARK)
  598. this._reconsumeInState(BOGUS_COMMENT_STATE);
  599. else {
  600. this._emitChar('<');
  601. this._reconsumeInState(DATA_STATE);
  602. }
  603. };
  604. //12.2.4.9 End tag open state
  605. //------------------------------------------------------------------
  606. _[END_TAG_OPEN_STATE] = function endTagOpenState(cp) {
  607. if (isAsciiLetter(cp)) {
  608. this._createEndTagToken();
  609. this._reconsumeInState(TAG_NAME_STATE);
  610. }
  611. else if (cp === $.GREATER_THAN_SIGN)
  612. this.state = DATA_STATE;
  613. else if (cp === $.EOF) {
  614. this._reconsumeInState(DATA_STATE);
  615. this._emitChar('<');
  616. this._emitChar('/');
  617. }
  618. else
  619. this._reconsumeInState(BOGUS_COMMENT_STATE);
  620. };
  621. //12.2.4.10 Tag name state
  622. //------------------------------------------------------------------
  623. _[TAG_NAME_STATE] = function tagNameState(cp) {
  624. if (isWhitespace(cp))
  625. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  626. else if (cp === $.SOLIDUS)
  627. this.state = SELF_CLOSING_START_TAG_STATE;
  628. else if (cp === $.GREATER_THAN_SIGN) {
  629. this.state = DATA_STATE;
  630. this._emitCurrentToken();
  631. }
  632. else if (isAsciiUpper(cp))
  633. this.currentToken.tagName += toAsciiLowerChar(cp);
  634. else if (cp === $.NULL)
  635. this.currentToken.tagName += UNICODE.REPLACEMENT_CHARACTER;
  636. else if (cp === $.EOF)
  637. this._reconsumeInState(DATA_STATE);
  638. else
  639. this.currentToken.tagName += toChar(cp);
  640. };
  641. //12.2.4.11 RCDATA less-than sign state
  642. //------------------------------------------------------------------
  643. _[RCDATA_LESS_THAN_SIGN_STATE] = function rcdataLessThanSignState(cp) {
  644. if (cp === $.SOLIDUS) {
  645. this.tempBuff = [];
  646. this.state = RCDATA_END_TAG_OPEN_STATE;
  647. }
  648. else {
  649. this._emitChar('<');
  650. this._reconsumeInState(RCDATA_STATE);
  651. }
  652. };
  653. //12.2.4.12 RCDATA end tag open state
  654. //------------------------------------------------------------------
  655. _[RCDATA_END_TAG_OPEN_STATE] = function rcdataEndTagOpenState(cp) {
  656. if (isAsciiLetter(cp)) {
  657. this._createEndTagToken();
  658. this._reconsumeInState(RCDATA_END_TAG_NAME_STATE);
  659. }
  660. else {
  661. this._emitChar('<');
  662. this._emitChar('/');
  663. this._reconsumeInState(RCDATA_STATE);
  664. }
  665. };
  666. //12.2.4.13 RCDATA end tag name state
  667. //------------------------------------------------------------------
  668. _[RCDATA_END_TAG_NAME_STATE] = function rcdataEndTagNameState(cp) {
  669. if (isAsciiUpper(cp)) {
  670. this.currentToken.tagName += toAsciiLowerChar(cp);
  671. this.tempBuff.push(cp);
  672. }
  673. else if (isAsciiLower(cp)) {
  674. this.currentToken.tagName += toChar(cp);
  675. this.tempBuff.push(cp);
  676. }
  677. else {
  678. if (this._isAppropriateEndTagToken()) {
  679. if (isWhitespace(cp)) {
  680. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  681. return;
  682. }
  683. if (cp === $.SOLIDUS) {
  684. this.state = SELF_CLOSING_START_TAG_STATE;
  685. return;
  686. }
  687. if (cp === $.GREATER_THAN_SIGN) {
  688. this.state = DATA_STATE;
  689. this._emitCurrentToken();
  690. return;
  691. }
  692. }
  693. this._emitChar('<');
  694. this._emitChar('/');
  695. this._emitSeveralCodePoints(this.tempBuff);
  696. this._reconsumeInState(RCDATA_STATE);
  697. }
  698. };
  699. //12.2.4.14 RAWTEXT less-than sign state
  700. //------------------------------------------------------------------
  701. _[RAWTEXT_LESS_THAN_SIGN_STATE] = function rawtextLessThanSignState(cp) {
  702. if (cp === $.SOLIDUS) {
  703. this.tempBuff = [];
  704. this.state = RAWTEXT_END_TAG_OPEN_STATE;
  705. }
  706. else {
  707. this._emitChar('<');
  708. this._reconsumeInState(RAWTEXT_STATE);
  709. }
  710. };
  711. //12.2.4.15 RAWTEXT end tag open state
  712. //------------------------------------------------------------------
  713. _[RAWTEXT_END_TAG_OPEN_STATE] = function rawtextEndTagOpenState(cp) {
  714. if (isAsciiLetter(cp)) {
  715. this._createEndTagToken();
  716. this._reconsumeInState(RAWTEXT_END_TAG_NAME_STATE);
  717. }
  718. else {
  719. this._emitChar('<');
  720. this._emitChar('/');
  721. this._reconsumeInState(RAWTEXT_STATE);
  722. }
  723. };
  724. //12.2.4.16 RAWTEXT end tag name state
  725. //------------------------------------------------------------------
  726. _[RAWTEXT_END_TAG_NAME_STATE] = function rawtextEndTagNameState(cp) {
  727. if (isAsciiUpper(cp)) {
  728. this.currentToken.tagName += toAsciiLowerChar(cp);
  729. this.tempBuff.push(cp);
  730. }
  731. else if (isAsciiLower(cp)) {
  732. this.currentToken.tagName += toChar(cp);
  733. this.tempBuff.push(cp);
  734. }
  735. else {
  736. if (this._isAppropriateEndTagToken()) {
  737. if (isWhitespace(cp)) {
  738. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  739. return;
  740. }
  741. if (cp === $.SOLIDUS) {
  742. this.state = SELF_CLOSING_START_TAG_STATE;
  743. return;
  744. }
  745. if (cp === $.GREATER_THAN_SIGN) {
  746. this._emitCurrentToken();
  747. this.state = DATA_STATE;
  748. return;
  749. }
  750. }
  751. this._emitChar('<');
  752. this._emitChar('/');
  753. this._emitSeveralCodePoints(this.tempBuff);
  754. this._reconsumeInState(RAWTEXT_STATE);
  755. }
  756. };
  757. //12.2.4.17 Script data less-than sign state
  758. //------------------------------------------------------------------
  759. _[SCRIPT_DATA_LESS_THAN_SIGN_STATE] = function scriptDataLessThanSignState(cp) {
  760. if (cp === $.SOLIDUS) {
  761. this.tempBuff = [];
  762. this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
  763. }
  764. else if (cp === $.EXCLAMATION_MARK) {
  765. this.state = SCRIPT_DATA_ESCAPE_START_STATE;
  766. this._emitChar('<');
  767. this._emitChar('!');
  768. }
  769. else {
  770. this._emitChar('<');
  771. this._reconsumeInState(SCRIPT_DATA_STATE);
  772. }
  773. };
  774. //12.2.4.18 Script data end tag open state
  775. //------------------------------------------------------------------
  776. _[SCRIPT_DATA_END_TAG_OPEN_STATE] = function scriptDataEndTagOpenState(cp) {
  777. if (isAsciiLetter(cp)) {
  778. this._createEndTagToken();
  779. this._reconsumeInState(SCRIPT_DATA_END_TAG_NAME_STATE);
  780. }
  781. else {
  782. this._emitChar('<');
  783. this._emitChar('/');
  784. this._reconsumeInState(SCRIPT_DATA_STATE);
  785. }
  786. };
  787. //12.2.4.19 Script data end tag name state
  788. //------------------------------------------------------------------
  789. _[SCRIPT_DATA_END_TAG_NAME_STATE] = function scriptDataEndTagNameState(cp) {
  790. if (isAsciiUpper(cp)) {
  791. this.currentToken.tagName += toAsciiLowerChar(cp);
  792. this.tempBuff.push(cp);
  793. }
  794. else if (isAsciiLower(cp)) {
  795. this.currentToken.tagName += toChar(cp);
  796. this.tempBuff.push(cp);
  797. }
  798. else {
  799. if (this._isAppropriateEndTagToken()) {
  800. if (isWhitespace(cp)) {
  801. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  802. return;
  803. }
  804. else if (cp === $.SOLIDUS) {
  805. this.state = SELF_CLOSING_START_TAG_STATE;
  806. return;
  807. }
  808. else if (cp === $.GREATER_THAN_SIGN) {
  809. this._emitCurrentToken();
  810. this.state = DATA_STATE;
  811. return;
  812. }
  813. }
  814. this._emitChar('<');
  815. this._emitChar('/');
  816. this._emitSeveralCodePoints(this.tempBuff);
  817. this._reconsumeInState(SCRIPT_DATA_STATE);
  818. }
  819. };
  820. //12.2.4.20 Script data escape start state
  821. //------------------------------------------------------------------
  822. _[SCRIPT_DATA_ESCAPE_START_STATE] = function scriptDataEscapeStartState(cp) {
  823. if (cp === $.HYPHEN_MINUS) {
  824. this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
  825. this._emitChar('-');
  826. }
  827. else
  828. this._reconsumeInState(SCRIPT_DATA_STATE);
  829. };
  830. //12.2.4.21 Script data escape start dash state
  831. //------------------------------------------------------------------
  832. _[SCRIPT_DATA_ESCAPE_START_DASH_STATE] = function scriptDataEscapeStartDashState(cp) {
  833. if (cp === $.HYPHEN_MINUS) {
  834. this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
  835. this._emitChar('-');
  836. }
  837. else
  838. this._reconsumeInState(SCRIPT_DATA_STATE);
  839. };
  840. //12.2.4.22 Script data escaped state
  841. //------------------------------------------------------------------
  842. _[SCRIPT_DATA_ESCAPED_STATE] = function scriptDataEscapedState(cp) {
  843. if (cp === $.HYPHEN_MINUS) {
  844. this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
  845. this._emitChar('-');
  846. }
  847. else if (cp === $.LESS_THAN_SIGN)
  848. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  849. else if (cp === $.NULL)
  850. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  851. else if (cp === $.EOF)
  852. this._reconsumeInState(DATA_STATE);
  853. else
  854. this._emitCodePoint(cp);
  855. };
  856. //12.2.4.23 Script data escaped dash state
  857. //------------------------------------------------------------------
  858. _[SCRIPT_DATA_ESCAPED_DASH_STATE] = function scriptDataEscapedDashState(cp) {
  859. if (cp === $.HYPHEN_MINUS) {
  860. this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
  861. this._emitChar('-');
  862. }
  863. else if (cp === $.LESS_THAN_SIGN)
  864. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  865. else if (cp === $.NULL) {
  866. this.state = SCRIPT_DATA_ESCAPED_STATE;
  867. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  868. }
  869. else if (cp === $.EOF)
  870. this._reconsumeInState(DATA_STATE);
  871. else {
  872. this.state = SCRIPT_DATA_ESCAPED_STATE;
  873. this._emitCodePoint(cp);
  874. }
  875. };
  876. //12.2.4.24 Script data escaped dash dash state
  877. //------------------------------------------------------------------
  878. _[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE] = function scriptDataEscapedDashDashState(cp) {
  879. if (cp === $.HYPHEN_MINUS)
  880. this._emitChar('-');
  881. else if (cp === $.LESS_THAN_SIGN)
  882. this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
  883. else if (cp === $.GREATER_THAN_SIGN) {
  884. this.state = SCRIPT_DATA_STATE;
  885. this._emitChar('>');
  886. }
  887. else if (cp === $.NULL) {
  888. this.state = SCRIPT_DATA_ESCAPED_STATE;
  889. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  890. }
  891. else if (cp === $.EOF)
  892. this._reconsumeInState(DATA_STATE);
  893. else {
  894. this.state = SCRIPT_DATA_ESCAPED_STATE;
  895. this._emitCodePoint(cp);
  896. }
  897. };
  898. //12.2.4.25 Script data escaped less-than sign state
  899. //------------------------------------------------------------------
  900. _[SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataEscapedLessThanSignState(cp) {
  901. if (cp === $.SOLIDUS) {
  902. this.tempBuff = [];
  903. this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
  904. }
  905. else if (isAsciiLetter(cp)) {
  906. this.tempBuff = [];
  907. this._emitChar('<');
  908. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE);
  909. }
  910. else {
  911. this._emitChar('<');
  912. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  913. }
  914. };
  915. //12.2.4.26 Script data escaped end tag open state
  916. //------------------------------------------------------------------
  917. _[SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE] = function scriptDataEscapedEndTagOpenState(cp) {
  918. if (isAsciiLetter(cp)) {
  919. this._createEndTagToken();
  920. this._reconsumeInState(SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE);
  921. }
  922. else {
  923. this._emitChar('<');
  924. this._emitChar('/');
  925. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  926. }
  927. };
  928. //12.2.4.27 Script data escaped end tag name state
  929. //------------------------------------------------------------------
  930. _[SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE] = function scriptDataEscapedEndTagNameState(cp) {
  931. if (isAsciiUpper(cp)) {
  932. this.currentToken.tagName += toAsciiLowerChar(cp);
  933. this.tempBuff.push(cp);
  934. }
  935. else if (isAsciiLower(cp)) {
  936. this.currentToken.tagName += toChar(cp);
  937. this.tempBuff.push(cp);
  938. }
  939. else {
  940. if (this._isAppropriateEndTagToken()) {
  941. if (isWhitespace(cp)) {
  942. this.state = BEFORE_ATTRIBUTE_NAME_STATE;
  943. return;
  944. }
  945. if (cp === $.SOLIDUS) {
  946. this.state = SELF_CLOSING_START_TAG_STATE;
  947. return;
  948. }
  949. if (cp === $.GREATER_THAN_SIGN) {
  950. this._emitCurrentToken();
  951. this.state = DATA_STATE;
  952. return;
  953. }
  954. }
  955. this._emitChar('<');
  956. this._emitChar('/');
  957. this._emitSeveralCodePoints(this.tempBuff);
  958. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  959. }
  960. };
  961. //12.2.4.28 Script data double escape start state
  962. //------------------------------------------------------------------
  963. _[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE] = function scriptDataDoubleEscapeStartState(cp) {
  964. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
  965. this.state = this.isTempBufferEqualToScriptString() ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE : SCRIPT_DATA_ESCAPED_STATE;
  966. this._emitCodePoint(cp);
  967. }
  968. else if (isAsciiUpper(cp)) {
  969. this.tempBuff.push(toAsciiLowerCodePoint(cp));
  970. this._emitCodePoint(cp);
  971. }
  972. else if (isAsciiLower(cp)) {
  973. this.tempBuff.push(cp);
  974. this._emitCodePoint(cp);
  975. }
  976. else
  977. this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
  978. };
  979. //12.2.4.29 Script data double escaped state
  980. //------------------------------------------------------------------
  981. _[SCRIPT_DATA_DOUBLE_ESCAPED_STATE] = function scriptDataDoubleEscapedState(cp) {
  982. if (cp === $.HYPHEN_MINUS) {
  983. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
  984. this._emitChar('-');
  985. }
  986. else if (cp === $.LESS_THAN_SIGN) {
  987. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  988. this._emitChar('<');
  989. }
  990. else if (cp === $.NULL)
  991. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  992. else if (cp === $.EOF)
  993. this._reconsumeInState(DATA_STATE);
  994. else
  995. this._emitCodePoint(cp);
  996. };
  997. //12.2.4.30 Script data double escaped dash state
  998. //------------------------------------------------------------------
  999. _[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE] = function scriptDataDoubleEscapedDashState(cp) {
  1000. if (cp === $.HYPHEN_MINUS) {
  1001. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
  1002. this._emitChar('-');
  1003. }
  1004. else if (cp === $.LESS_THAN_SIGN) {
  1005. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  1006. this._emitChar('<');
  1007. }
  1008. else if (cp === $.NULL) {
  1009. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1010. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  1011. }
  1012. else if (cp === $.EOF)
  1013. this._reconsumeInState(DATA_STATE);
  1014. else {
  1015. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1016. this._emitCodePoint(cp);
  1017. }
  1018. };
  1019. //12.2.4.31 Script data double escaped dash dash state
  1020. //------------------------------------------------------------------
  1021. _[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE] = function scriptDataDoubleEscapedDashDashState(cp) {
  1022. if (cp === $.HYPHEN_MINUS)
  1023. this._emitChar('-');
  1024. else if (cp === $.LESS_THAN_SIGN) {
  1025. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
  1026. this._emitChar('<');
  1027. }
  1028. else if (cp === $.GREATER_THAN_SIGN) {
  1029. this.state = SCRIPT_DATA_STATE;
  1030. this._emitChar('>');
  1031. }
  1032. else if (cp === $.NULL) {
  1033. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1034. this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
  1035. }
  1036. else if (cp === $.EOF)
  1037. this._reconsumeInState(DATA_STATE);
  1038. else {
  1039. this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1040. this._emitCodePoint(cp);
  1041. }
  1042. };
  1043. //12.2.4.32 Script data double escaped less-than sign state
  1044. //------------------------------------------------------------------
  1045. _[SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataDoubleEscapedLessThanSignState(cp) {
  1046. if (cp === $.SOLIDUS) {
  1047. this.tempBuff = [];
  1048. this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
  1049. this._emitChar('/');
  1050. }
  1051. else
  1052. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
  1053. };
  1054. //12.2.4.33 Script data double escape end state
  1055. //------------------------------------------------------------------
  1056. _[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE] = function scriptDataDoubleEscapeEndState(cp) {
  1057. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
  1058. this.state = this.isTempBufferEqualToScriptString() ? SCRIPT_DATA_ESCAPED_STATE : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
  1059. this._emitCodePoint(cp);
  1060. }
  1061. else if (isAsciiUpper(cp)) {
  1062. this.tempBuff.push(toAsciiLowerCodePoint(cp));
  1063. this._emitCodePoint(cp);
  1064. }
  1065. else if (isAsciiLower(cp)) {
  1066. this.tempBuff.push(cp);
  1067. this._emitCodePoint(cp);
  1068. }
  1069. else
  1070. this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
  1071. };
  1072. //12.2.4.34 Before attribute name state
  1073. //------------------------------------------------------------------
  1074. _[BEFORE_ATTRIBUTE_NAME_STATE] = function beforeAttributeNameState(cp) {
  1075. if (isWhitespace(cp))
  1076. return;
  1077. if (cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF)
  1078. this._reconsumeInState(AFTER_ATTRIBUTE_NAME_STATE);
  1079. else if (cp === $.EQUALS_SIGN) {
  1080. this._createAttr('=');
  1081. this.state = ATTRIBUTE_NAME_STATE;
  1082. }
  1083. else {
  1084. this._createAttr('');
  1085. this._reconsumeInState(ATTRIBUTE_NAME_STATE);
  1086. }
  1087. };
  1088. //12.2.4.35 Attribute name state
  1089. //------------------------------------------------------------------
  1090. _[ATTRIBUTE_NAME_STATE] = function attributeNameState(cp) {
  1091. if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
  1092. this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
  1093. this._unconsume();
  1094. }
  1095. else if (cp === $.EQUALS_SIGN)
  1096. this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
  1097. else if (isAsciiUpper(cp))
  1098. this.currentAttr.name += toAsciiLowerChar(cp);
  1099. else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN)
  1100. this.currentAttr.name += toChar(cp);
  1101. else if (cp === $.NULL)
  1102. this.currentAttr.name += UNICODE.REPLACEMENT_CHARACTER;
  1103. else
  1104. this.currentAttr.name += toChar(cp);
  1105. };
  1106. //12.2.4.36 After attribute name state
  1107. //------------------------------------------------------------------
  1108. _[AFTER_ATTRIBUTE_NAME_STATE] = function afterAttributeNameState(cp) {
  1109. if (isWhitespace(cp))
  1110. return;
  1111. if (cp === $.SOLIDUS)
  1112. this.state = SELF_CLOSING_START_TAG_STATE;
  1113. else if (cp === $.EQUALS_SIGN)
  1114. this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
  1115. else if (cp === $.GREATER_THAN_SIGN) {
  1116. this.state = DATA_STATE;
  1117. this._emitCurrentToken();
  1118. }
  1119. else if (cp === $.EOF)
  1120. this._reconsumeInState(DATA_STATE);
  1121. else {
  1122. this._createAttr('');
  1123. this._reconsumeInState(ATTRIBUTE_NAME_STATE);
  1124. }
  1125. };
  1126. //12.2.4.37 Before attribute value state
  1127. //------------------------------------------------------------------
  1128. _[BEFORE_ATTRIBUTE_VALUE_STATE] = function beforeAttributeValueState(cp) {
  1129. if (isWhitespace(cp))
  1130. return;
  1131. if (cp === $.QUOTATION_MARK)
  1132. this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
  1133. else if (cp === $.APOSTROPHE)
  1134. this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
  1135. else
  1136. this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
  1137. };
  1138. //12.2.4.38 Attribute value (double-quoted) state
  1139. //------------------------------------------------------------------
  1140. _[ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE] = function attributeValueDoubleQuotedState(cp) {
  1141. if (cp === $.QUOTATION_MARK)
  1142. this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  1143. else if (cp === $.AMPERSAND) {
  1144. this.additionalAllowedCp = $.QUOTATION_MARK;
  1145. this.returnState = this.state;
  1146. this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
  1147. }
  1148. else if (cp === $.NULL)
  1149. this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
  1150. else if (cp === $.EOF)
  1151. this._reconsumeInState(DATA_STATE);
  1152. else
  1153. this.currentAttr.value += toChar(cp);
  1154. };
  1155. //12.2.4.39 Attribute value (single-quoted) state
  1156. //------------------------------------------------------------------
  1157. _[ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE] = function attributeValueSingleQuotedState(cp) {
  1158. if (cp === $.APOSTROPHE)
  1159. this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
  1160. else if (cp === $.AMPERSAND) {
  1161. this.additionalAllowedCp = $.APOSTROPHE;
  1162. this.returnState = this.state;
  1163. this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
  1164. }
  1165. else if (cp === $.NULL)
  1166. this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
  1167. else if (cp === $.EOF)
  1168. this._reconsumeInState(DATA_STATE);
  1169. else
  1170. this.currentAttr.value += toChar(cp);
  1171. };
  1172. //12.2.4.40 Attribute value (unquoted) state
  1173. //------------------------------------------------------------------
  1174. _[ATTRIBUTE_VALUE_UNQUOTED_STATE] = function attributeValueUnquotedState(cp) {
  1175. if (isWhitespace(cp))
  1176. this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
  1177. else if (cp === $.AMPERSAND) {
  1178. this.additionalAllowedCp = $.GREATER_THAN_SIGN;
  1179. this.returnState = this.state;
  1180. this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
  1181. }
  1182. else if (cp === $.GREATER_THAN_SIGN) {
  1183. this._leaveAttrValue(DATA_STATE);
  1184. this._emitCurrentToken();
  1185. }
  1186. else if (cp === $.NULL)
  1187. this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
  1188. else if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || cp === $.LESS_THAN_SIGN ||
  1189. cp === $.EQUALS_SIGN || cp === $.GRAVE_ACCENT)
  1190. this.currentAttr.value += toChar(cp);
  1191. else if (cp === $.EOF)
  1192. this._reconsumeInState(DATA_STATE);
  1193. else
  1194. this.currentAttr.value += toChar(cp);
  1195. };
  1196. //12.2.4.41 Character reference in attribute value state
  1197. //------------------------------------------------------------------
  1198. _[CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE] = function characterReferenceInAttributeValueState(cp) {
  1199. var referencedCodePoints = this._consumeCharacterReference(cp, true);
  1200. if (!this._ensureHibernation()) {
  1201. if (referencedCodePoints) {
  1202. for (var i = 0; i < referencedCodePoints.length; i++)
  1203. this.currentAttr.value += toChar(referencedCodePoints[i]);
  1204. }
  1205. else
  1206. this.currentAttr.value += '&';
  1207. this.state = this.returnState;
  1208. }
  1209. };
  1210. //12.2.4.42 After attribute value (quoted) state
  1211. //------------------------------------------------------------------
  1212. _[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE] = function afterAttributeValueQuotedState(cp) {
  1213. if (isWhitespace(cp))
  1214. this._leaveAttrValue(BEFORE_ATTRIBUTE_NAME_STATE);
  1215. else if (cp === $.SOLIDUS)
  1216. this._leaveAttrValue(SELF_CLOSING_START_TAG_STATE);
  1217. else if (cp === $.GREATER_THAN_SIGN) {
  1218. this._leaveAttrValue(DATA_STATE);
  1219. this._emitCurrentToken();
  1220. }
  1221. else if (cp === $.EOF)
  1222. this._reconsumeInState(DATA_STATE);
  1223. else
  1224. this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
  1225. };
  1226. //12.2.4.43 Self-closing start tag state
  1227. //------------------------------------------------------------------
  1228. _[SELF_CLOSING_START_TAG_STATE] = function selfClosingStartTagState(cp) {
  1229. if (cp === $.GREATER_THAN_SIGN) {
  1230. this.currentToken.selfClosing = true;
  1231. this.state = DATA_STATE;
  1232. this._emitCurrentToken();
  1233. }
  1234. else if (cp === $.EOF)
  1235. this._reconsumeInState(DATA_STATE);
  1236. else
  1237. this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
  1238. };
  1239. //12.2.4.44 Bogus comment state
  1240. //------------------------------------------------------------------
  1241. _[BOGUS_COMMENT_STATE] = function bogusCommentState() {
  1242. this._createCommentToken();
  1243. this._reconsumeInState(BOGUS_COMMENT_STATE_CONTINUATION);
  1244. };
//HACK: to support streaming and make BOGUS_COMMENT_STATE reentrant we've
//introduced the BOGUS_COMMENT_STATE_CONTINUATION state, which does not create
//a new comment token on each call.
  1248. _[BOGUS_COMMENT_STATE_CONTINUATION] = function bogusCommentStateContinuation(cp) {
  1249. while (true) {
  1250. if (cp === $.GREATER_THAN_SIGN) {
  1251. this.state = DATA_STATE;
  1252. break;
  1253. }
  1254. else if (cp === $.EOF) {
  1255. this._reconsumeInState(DATA_STATE);
  1256. break;
  1257. }
  1258. else {
  1259. this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
  1260. this._hibernationSnapshot();
  1261. cp = this._consume();
  1262. if (this._ensureHibernation())
  1263. return;
  1264. }
  1265. }
  1266. this._emitCurrentToken();
  1267. };
  1268. //12.2.4.45 Markup declaration open state
  1269. //------------------------------------------------------------------
  1270. _[MARKUP_DECLARATION_OPEN_STATE] = function markupDeclarationOpenState(cp) {
  1271. var dashDashMatch = this._consumeSubsequentIfMatch($$.DASH_DASH_STRING, cp, true),
  1272. doctypeMatch = !dashDashMatch && this._consumeSubsequentIfMatch($$.DOCTYPE_STRING, cp, false),
  1273. cdataMatch = !dashDashMatch && !doctypeMatch &&
  1274. this.allowCDATA &&
  1275. this._consumeSubsequentIfMatch($$.CDATA_START_STRING, cp, true);
  1276. if (!this._ensureHibernation()) {
  1277. if (dashDashMatch) {
  1278. this._createCommentToken();
  1279. this.state = COMMENT_START_STATE;
  1280. }
  1281. else if (doctypeMatch)
  1282. this.state = DOCTYPE_STATE;
  1283. else if (cdataMatch)
  1284. this.state = CDATA_SECTION_STATE;
  1285. else
  1286. this._reconsumeInState(BOGUS_COMMENT_STATE);
  1287. }
  1288. };
  1289. //12.2.4.46 Comment start state
  1290. //------------------------------------------------------------------
  1291. _[COMMENT_START_STATE] = function commentStartState(cp) {
  1292. if (cp === $.HYPHEN_MINUS)
  1293. this.state = COMMENT_START_DASH_STATE;
  1294. else if (cp === $.NULL) {
  1295. this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
  1296. this.state = COMMENT_STATE;
  1297. }
  1298. else if (cp === $.GREATER_THAN_SIGN) {
  1299. this.state = DATA_STATE;
  1300. this._emitCurrentToken();
  1301. }
  1302. else if (cp === $.EOF) {
  1303. this._emitCurrentToken();
  1304. this._reconsumeInState(DATA_STATE);
  1305. }
  1306. else {
  1307. this.currentToken.data += toChar(cp);
  1308. this.state = COMMENT_STATE;
  1309. }
  1310. };
  1311. //12.2.4.47 Comment start dash state
  1312. //------------------------------------------------------------------
  1313. _[COMMENT_START_DASH_STATE] = function commentStartDashState(cp) {
  1314. if (cp === $.HYPHEN_MINUS)
  1315. this.state = COMMENT_END_STATE;
  1316. else if (cp === $.NULL) {
  1317. this.currentToken.data += '-';
  1318. this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
  1319. this.state = COMMENT_STATE;
  1320. }
  1321. else if (cp === $.GREATER_THAN_SIGN) {
  1322. this.state = DATA_STATE;
  1323. this._emitCurrentToken();
  1324. }
  1325. else if (cp === $.EOF) {
  1326. this._emitCurrentToken();
  1327. this._reconsumeInState(DATA_STATE);
  1328. }
  1329. else {
  1330. this.currentToken.data += '-';
  1331. this.currentToken.data += toChar(cp);
  1332. this.state = COMMENT_STATE;
  1333. }
  1334. };
  1335. //12.2.4.48 Comment state
  1336. //------------------------------------------------------------------
  1337. _[COMMENT_STATE] = function commentState(cp) {
  1338. if (cp === $.HYPHEN_MINUS)
  1339. this.state = COMMENT_END_DASH_STATE;
  1340. else if (cp === $.NULL)
  1341. this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
  1342. else if (cp === $.EOF) {
  1343. this._emitCurrentToken();
  1344. this._reconsumeInState(DATA_STATE);
  1345. }
  1346. else
  1347. this.currentToken.data += toChar(cp);
  1348. };
  1349. //12.2.4.49 Comment end dash state
  1350. //------------------------------------------------------------------
  1351. _[COMMENT_END_DASH_STATE] = function commentEndDashState(cp) {
  1352. if (cp === $.HYPHEN_MINUS)
  1353. this.state = COMMENT_END_STATE;
  1354. else if (cp === $.NULL) {
  1355. this.currentToken.data += '-';
  1356. this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
  1357. this.state = COMMENT_STATE;
  1358. }
  1359. else if (cp === $.EOF) {
  1360. this._emitCurrentToken();
  1361. this._reconsumeInState(DATA_STATE);
  1362. }
  1363. else {
  1364. this.currentToken.data += '-';
  1365. this.currentToken.data += toChar(cp);
  1366. this.state = COMMENT_STATE;
  1367. }
  1368. };
  1369. //12.2.4.50 Comment end state
  1370. //------------------------------------------------------------------
  1371. _[COMMENT_END_STATE] = function commentEndState(cp) {
  1372. if (cp === $.GREATER_THAN_SIGN) {
  1373. this.state = DATA_STATE;
  1374. this._emitCurrentToken();
  1375. }
  1376. else if (cp === $.EXCLAMATION_MARK)
  1377. this.state = COMMENT_END_BANG_STATE;
  1378. else if (cp === $.HYPHEN_MINUS)
  1379. this.currentToken.data += '-';
  1380. else if (cp === $.NULL) {
  1381. this.currentToken.data += '--';
  1382. this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
  1383. this.state = COMMENT_STATE;
  1384. }
  1385. else if (cp === $.EOF) {
  1386. this._reconsumeInState(DATA_STATE);
  1387. this._emitCurrentToken();
  1388. }
  1389. else {
  1390. this.currentToken.data += '--';
  1391. this.currentToken.data += toChar(cp);
  1392. this.state = COMMENT_STATE;
  1393. }
  1394. };
  1395. //12.2.4.51 Comment end bang state
  1396. //------------------------------------------------------------------
  // Handler for COMMENT_END_BANG_STATE. '-' appends '--!' to the token data
  // and moves to COMMENT_END_DASH_STATE; '>' switches to DATA_STATE and emits
  // the token; EOF emits and reconsumes in DATA_STATE. Every other code point
  // appends '--!' followed by the character (replacement character for NULL)
  // and returns to COMMENT_STATE.
  _[COMMENT_END_BANG_STATE] = function commentEndBangState(cp) {
      switch (cp) {
          case $.HYPHEN_MINUS:
              this.currentToken.data += '--!';
              this.state = COMMENT_END_DASH_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.state = DATA_STATE;
              this._emitCurrentToken();
              break;

          case $.NULL:
              this.currentToken.data += '--!';
              this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
              this.state = COMMENT_STATE;
              break;

          case $.EOF:
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.data += '--!';
              this.currentToken.data += toChar(cp);
              this.state = COMMENT_STATE;
      }
  };
  1421. //12.2.4.52 DOCTYPE state
  1422. //------------------------------------------------------------------
  // Handler for DOCTYPE_STATE. Whitespace is skipped. '>' and EOF both create
  // a DOCTYPE token with a null name, flag it forceQuirks and emit it; '>'
  // then switches to DATA_STATE while EOF is reconsumed there. Any other code
  // point starts a token with an empty name and is reconsumed in
  // DOCTYPE_NAME_STATE.
  _[DOCTYPE_STATE] = function doctypeState(cp) {
      if (isWhitespace(cp))
          return;

      if (cp === $.GREATER_THAN_SIGN) {
          this._createDoctypeToken(null);
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this.state = DATA_STATE;
          return;
      }

      if (cp === $.EOF) {
          this._createDoctypeToken(null);
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this._reconsumeInState(DATA_STATE);
          return;
      }

      this._createDoctypeToken('');
      this._reconsumeInState(DOCTYPE_NAME_STATE);
  };
  1443. //12.2.4.54 DOCTYPE name state
  1444. //------------------------------------------------------------------
  // Handler for DOCTYPE_NAME_STATE. Whitespace, '>' and EOF are handed over to
  // AFTER_DOCTYPE_NAME_STATE via reconsume. Everything else extends the token
  // name: ASCII uppercase is lowercased, NULL becomes the replacement
  // character, and other code points are appended as-is.
  _[DOCTYPE_NAME_STATE] = function doctypeNameState(cp) {
      var terminatesName = isWhitespace(cp) || cp === $.GREATER_THAN_SIGN || cp === $.EOF;

      if (terminatesName) {
          this._reconsumeInState(AFTER_DOCTYPE_NAME_STATE);
          return;
      }

      var piece;

      if (isAsciiUpper(cp))
          piece = toAsciiLowerChar(cp);
      else if (cp === $.NULL)
          piece = UNICODE.REPLACEMENT_CHARACTER;
      else
          piece = toChar(cp);

      this.currentToken.name += piece;
  };
  1455. //12.2.4.55 After DOCTYPE name state
  1456. //------------------------------------------------------------------
  // Handler for AFTER_DOCTYPE_NAME_STATE. Whitespace is skipped and '>' emits
  // the token after switching to DATA_STATE. Otherwise the input is matched
  // against the PUBLIC string and, only if that fails, the SYSTEM string.
  // When _ensureHibernation() reports true the state is left untouched;
  // otherwise the match decides the next state, and no match flags the token
  // forceQuirks and enters BOGUS_DOCTYPE_STATE.
  _[AFTER_DOCTYPE_NAME_STATE] = function afterDoctypeNameState(cp) {
      if (isWhitespace(cp))
          return;

      if (cp === $.GREATER_THAN_SIGN) {
          this.state = DATA_STATE;
          this._emitCurrentToken();
          return;
      }

      var sawPublic = this._consumeSubsequentIfMatch($$.PUBLIC_STRING, cp, false);
      var sawSystem = false;

      // The SYSTEM keyword is only tried when PUBLIC did not match.
      if (!sawPublic)
          sawSystem = this._consumeSubsequentIfMatch($$.SYSTEM_STRING, cp, false);

      if (this._ensureHibernation())
          return;

      if (sawPublic)
          this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
      else if (sawSystem)
          this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
      else {
          this.currentToken.forceQuirks = true;
          this.state = BOGUS_DOCTYPE_STATE;
      }
  };
  1479. //12.2.4.57 Before DOCTYPE public identifier state
  1480. //------------------------------------------------------------------
  // Handler for BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE. Whitespace is skipped.
  // An opening '"' or '\'' resets publicId to an empty string and enters the
  // matching quoted-identifier state. Anything else flags the token
  // forceQuirks and is reconsumed in BOGUS_DOCTYPE_STATE.
  _[BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function beforeDoctypePublicIdentifierState(cp) {
      if (isWhitespace(cp))
          return;

      switch (cp) {
          case $.QUOTATION_MARK:
              this.currentToken.publicId = '';
              this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
              break;

          case $.APOSTROPHE:
              this.currentToken.publicId = '';
              this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
              break;

          default:
              this.currentToken.forceQuirks = true;
              this._reconsumeInState(BOGUS_DOCTYPE_STATE);
      }
  };
  1497. //12.2.4.58 DOCTYPE public identifier (double-quoted) state
  1498. //------------------------------------------------------------------
  // Handler for DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE. The closing '"'
  // moves to BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE. '>' and EOF
  // flag the token forceQuirks and emit it ('>' switches to DATA_STATE, EOF
  // is reconsumed there). All other code points are appended to publicId,
  // with NULL stored as the replacement character.
  _[DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypePublicIdentifierDoubleQuotedState(cp) {
      switch (cp) {
          case $.QUOTATION_MARK:
              this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
              break;

          case $.NULL:
              this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.GREATER_THAN_SIGN:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.EOF:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.publicId += toChar(cp);
      }
  };
  1517. //12.2.4.59 DOCTYPE public identifier (single-quoted) state
  1518. //------------------------------------------------------------------
  // Handler for DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE. The closing
  // apostrophe moves to BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE.
  // '>' and EOF flag the token forceQuirks and emit it ('>' switches to
  // DATA_STATE, EOF is reconsumed there). All other code points are appended
  // to publicId, with NULL stored as the replacement character.
  _[DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypePublicIdentifierSingleQuotedState(cp) {
      if (cp === $.APOSTROPHE) {
          this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
          return;
      }

      if (cp === $.NULL) {
          this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
          return;
      }

      if (cp === $.GREATER_THAN_SIGN) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this.state = DATA_STATE;
          return;
      }

      if (cp === $.EOF) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this._reconsumeInState(DATA_STATE);
          return;
      }

      this.currentToken.publicId += toChar(cp);
  };
  1537. //12.2.4.61 Between DOCTYPE public and system identifiers state
  1538. //------------------------------------------------------------------
  // Handler for BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE.
  // Whitespace is skipped. '>' emits the token and switches to DATA_STATE.
  // An opening quote resets systemId to an empty string and enters the
  // matching quoted system-identifier state. Anything else flags the token
  // forceQuirks and is reconsumed in BOGUS_DOCTYPE_STATE.
  _[BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE] = function betweenDoctypePublicAndSystemIdentifiersState(cp) {
      if (isWhitespace(cp))
          return;

      switch (cp) {
          case $.GREATER_THAN_SIGN:
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.QUOTATION_MARK:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
              break;

          case $.APOSTROPHE:
              this.currentToken.systemId = '';
              this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
              break;

          default:
              this.currentToken.forceQuirks = true;
              this._reconsumeInState(BOGUS_DOCTYPE_STATE);
      }
  };
  1559. //12.2.4.63 Before DOCTYPE system identifier state
  1560. //------------------------------------------------------------------
  // Handler for BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE. Whitespace is skipped.
  // An opening '"' or '\'' resets systemId to an empty string and enters the
  // matching quoted-identifier state. Anything else flags the token
  // forceQuirks and is reconsumed in BOGUS_DOCTYPE_STATE.
  _[BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function beforeDoctypeSystemIdentifierState(cp) {
      if (isWhitespace(cp))
          return;

      if (cp === $.QUOTATION_MARK) {
          this.currentToken.systemId = '';
          this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
          return;
      }

      if (cp === $.APOSTROPHE) {
          this.currentToken.systemId = '';
          this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
          return;
      }

      this.currentToken.forceQuirks = true;
      this._reconsumeInState(BOGUS_DOCTYPE_STATE);
  };
  1577. //12.2.4.64 DOCTYPE system identifier (double-quoted) state
  1578. //------------------------------------------------------------------
  // Handler for DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE. The closing '"'
  // moves to AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE. '>' and EOF flag the token
  // forceQuirks and emit it ('>' switches to DATA_STATE, EOF is reconsumed
  // there). All other code points are appended to systemId, with NULL stored
  // as the replacement character.
  _[DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypeSystemIdentifierDoubleQuotedState(cp) {
      if (cp === $.QUOTATION_MARK) {
          this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
          return;
      }

      if (cp === $.GREATER_THAN_SIGN) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this.state = DATA_STATE;
          return;
      }

      if (cp === $.NULL) {
          this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
          return;
      }

      if (cp === $.EOF) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this._reconsumeInState(DATA_STATE);
          return;
      }

      this.currentToken.systemId += toChar(cp);
  };
  1597. //12.2.4.65 DOCTYPE system identifier (single-quoted) state
  1598. //------------------------------------------------------------------
  // Handler for DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE. The closing
  // apostrophe moves to AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE. '>' and EOF
  // flag the token forceQuirks and emit it ('>' switches to DATA_STATE, EOF
  // is reconsumed there). All other code points are appended to systemId,
  // with NULL stored as the replacement character.
  _[DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypeSystemIdentifierSingleQuotedState(cp) {
      switch (cp) {
          case $.APOSTROPHE:
              this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
              break;

          case $.GREATER_THAN_SIGN:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this.state = DATA_STATE;
              break;

          case $.NULL:
              this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
              break;

          case $.EOF:
              this.currentToken.forceQuirks = true;
              this._emitCurrentToken();
              this._reconsumeInState(DATA_STATE);
              break;

          default:
              this.currentToken.systemId += toChar(cp);
      }
  };
  1617. //12.2.4.66 After DOCTYPE system identifier state
  1618. //------------------------------------------------------------------
  // Handler for AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE. Whitespace is skipped.
  // '>' emits the token and switches to DATA_STATE. EOF flags the token
  // forceQuirks, emits it and is reconsumed in DATA_STATE. Any other code
  // point switches to BOGUS_DOCTYPE_STATE without touching forceQuirks.
  _[AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function afterDoctypeSystemIdentifierState(cp) {
      if (isWhitespace(cp))
          return;

      if (cp === $.GREATER_THAN_SIGN) {
          this._emitCurrentToken();
          this.state = DATA_STATE;
          return;
      }

      if (cp === $.EOF) {
          this.currentToken.forceQuirks = true;
          this._emitCurrentToken();
          this._reconsumeInState(DATA_STATE);
          return;
      }

      this.state = BOGUS_DOCTYPE_STATE;
  };
  1634. //12.2.4.67 Bogus DOCTYPE state
  1635. //------------------------------------------------------------------
  // Handler for BOGUS_DOCTYPE_STATE. Every code point is ignored except '>'
  // and EOF, both of which emit the current token; '>' then switches to
  // DATA_STATE while EOF is reconsumed there.
  _[BOGUS_DOCTYPE_STATE] = function bogusDoctypeState(cp) {
      var isClosingBracket = cp === $.GREATER_THAN_SIGN;

      // Nothing to do until the DOCTYPE is closed or the input ends.
      if (!isClosingBracket && cp !== $.EOF)
          return;

      this._emitCurrentToken();

      if (isClosingBracket)
          this.state = DATA_STATE;
      else
          this._reconsumeInState(DATA_STATE);
  };
  1646. //12.2.4.68 CDATA section state
  1647. //------------------------------------------------------------------
  // Handler for CDATA_SECTION_STATE. Loops over code points inside this one
  // call: EOF is reconsumed in DATA_STATE, a match of the CDATA end string
  // switches to DATA_STATE, and every other code point is emitted before the
  // next one is consumed. The loop stops immediately whenever
  // _ensureHibernation() reports true.
  _[CDATA_SECTION_STATE] = function cdataSectionState(cp) {
      for (;;) {
          if (cp === $.EOF) {
              this._reconsumeInState(DATA_STATE);
              return;
          }

          var sectionClosed = this._consumeSubsequentIfMatch($$.CDATA_END_STRING, cp, true);

          // The hibernation check runs before the match result is examined.
          if (this._ensureHibernation())
              return;

          if (sectionClosed) {
              this.state = DATA_STATE;
              return;
          }

          this._emitCodePoint(cp);

          // A snapshot is taken before pulling the next code point.
          this._hibernationSnapshot();
          cp = this._consume();

          if (this._ensureHibernation())
              return;
      }
  };