// html-serializer.js (8.0 KB)
/*
 * Copyright 2010-2020 Gildas Lormeau
 * contact : gildas.lormeau <at> gmail.com
 *
 * This file is part of SingleFile.
 *
 *   The code in this file is free software: you can redistribute it and/or
 *   modify it under the terms of the GNU Affero General Public License
 *   (GNU AGPL) as published by the Free Software Foundation, either version 3
 *   of the License, or (at your option) any later version.
 *
 *   The code in this file is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
 *   General Public License for more details.
 *
 *   As additional permission under GNU AGPL version 3 section 7, you may
 *   distribute UNMODIFIED VERSIONS OF THIS file without the copy of the GNU
 *   AGPL normally required by section 4, provided you include this license
 *   notice and a URL through which recipients can access the Corresponding
 *   Source.
 */
  23. this.singlefile.lib.modules.serializer = this.singlefile.lib.modules.serializer || (() => {
  // Tag names for which serializeElement writes no end tag (outside SVG content).
  const SELF_CLOSED_TAG_NAMES = [
    "area", "base", "br", "col", "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta", "param", "source", "track", "wbr"
  ];

  // Numeric values compared against `node.nodeType` by the serializer.
  const Node_ELEMENT_NODE = 1;
  const Node_TEXT_NODE = 3;
  const Node_COMMENT_NODE = 8;

  // Shared predicates used by the end-tag rules below. Each receives the next
  // sibling of the element being closed (`next` may be null/undefined).
  const notBeforeComment = next => !next || next.nodeType != Node_COMMENT_NODE;
  const notBeforeCommentOrSpace = next => {
    if (!next) {
      return true;
    }
    if (next.nodeType == Node_COMMENT_NODE) {
      return false;
    }
    return next.nodeType != Node_TEXT_NODE || !startsWithSpaceChar(next.textContent);
  };
  const isLastOrBefore = (...tagNames) => next => !next || tagNames.includes(next.tagName);
  const isListElement = node => node && (node.tagName == "UL" || node.tagName == "OL");

  // Elements that allow the end tag of a preceding <p> to be left out.
  const P_END_TAG_FOLLOWERS = [
    "ADDRESS", "ARTICLE", "ASIDE", "BLOCKQUOTE", "DETAILS", "DIV", "DL",
    "FIELDSET", "FIGCAPTION", "FIGURE", "FOOTER", "FORM",
    "H1", "H2", "H3", "H4", "H5", "H6",
    "HEADER", "HR", "MAIN", "NAV", "OL", "P", "PRE", "SECTION", "TABLE", "UL"
  ];

  // see https://www.w3.org/TR/html5/syntax.html#optional-tags
  // Start tags that may be dropped when compressing; `accept` receives the element itself.
  const OMITTED_START_TAGS = [
    {
      tagName: "head",
      accept: element => !element.childNodes.length || element.childNodes[0].nodeType == Node_ELEMENT_NODE
    },
    {
      tagName: "body",
      accept: element => !element.childNodes.length
    }
  ];

  // End tags that may be dropped when compressing; `accept` receives the
  // element's next sibling and the element itself.
  const OMITTED_END_TAGS = [
    { tagName: "html", accept: notBeforeComment },
    { tagName: "head", accept: notBeforeCommentOrSpace },
    { tagName: "body", accept: notBeforeComment },
    {
      tagName: "li",
      accept: (next, element) => (!next && isListElement(element.parentElement)) || (next && next.tagName == "LI")
    },
    { tagName: "dt", accept: isLastOrBefore("DT", "DD") },
    { tagName: "p", accept: next => next && P_END_TAG_FOLLOWERS.includes(next.tagName) },
    { tagName: "dd", accept: isLastOrBefore("DT", "DD") },
    { tagName: "rt", accept: isLastOrBefore("RT", "RP") },
    { tagName: "rp", accept: isLastOrBefore("RT", "RP") },
    { tagName: "optgroup", accept: isLastOrBefore("OPTGROUP") },
    { tagName: "option", accept: isLastOrBefore("OPTION", "OPTGROUP") },
    { tagName: "colgroup", accept: notBeforeCommentOrSpace },
    { tagName: "caption", accept: notBeforeCommentOrSpace },
    { tagName: "thead", accept: isLastOrBefore("TBODY", "TFOOT") },
    { tagName: "tbody", accept: isLastOrBefore("TBODY", "TFOOT") },
    { tagName: "tfoot", accept: next => !next },
    { tagName: "tr", accept: isLastOrBefore("TR") },
    { tagName: "td", accept: isLastOrBefore("TD", "TH") },
    { tagName: "th", accept: isLastOrBefore("TD", "TH") }
  ];

  // Parent tag names whose text children serializeTextNode emits without entity escaping.
  const TEXT_NODE_TAGS = ["style", "script", "xmp", "iframe", "noembed", "noframes", "plaintext", "noscript"];
  // Public API of the module: only `process` is exposed on the returned object;
  // the other functions and constants are reachable only from inside this closure.
  return {
    process
  };
  /**
   * Serializes a whole document: an optional doctype declaration followed by
   * the serialization of `doc.documentElement`.
   *
   * @param {Document} doc - document to serialize; `doc.doctype` may be null
   * @param {boolean} compressHTML - forwarded to `serialize` to enable the compact output rules
   * @returns {string} the doctype declaration (when present, followed by one space) and the markup
   */
  function process(doc, compressHTML) {
    const { doctype } = doc;
    const declaration = [];
    if (doctype) {
      const { publicId, systemId, internalSubset } = doctype;
      declaration.push(`<!DOCTYPE ${doctype.nodeName}`);
      if (publicId) {
        declaration.push(` PUBLIC "${publicId}"`);
      }
      if (systemId) {
        // The SYSTEM keyword is only written when no public identifier precedes the system one.
        declaration.push(publicId ? ` "${systemId}"` : ` SYSTEM "${systemId}"`);
      }
      if (internalSubset) {
        declaration.push(` [${internalSubset}]`);
      }
      declaration.push("> ");
    }
    return declaration.join("") + serialize(doc.documentElement, compressHTML);
  }
  /**
   * Serializes a single DOM node by dispatching on its `nodeType`.
   *
   * Text, comment and element nodes are delegated to their dedicated
   * serializers. Any other node type produces an empty string: callers
   * concatenate the result, so returning `undefined` would insert the literal
   * text "undefined" into the output.
   *
   * @param {Node} node - node to serialize
   * @param {boolean} compressHTML - forwarded to `serializeElement`
   * @param {boolean} [isSVG] - forwarded to `serializeElement`
   * @returns {string} the markup for `node`, or "" for unsupported node types
   */
  function serialize(node, compressHTML, isSVG) {
    if (node.nodeType == Node_TEXT_NODE) {
      return serializeTextNode(node);
    } else if (node.nodeType == Node_COMMENT_NODE) {
      return serializeCommentNode(node);
    } else if (node.nodeType == Node_ELEMENT_NODE) {
      return serializeElement(node, compressHTML, isSVG);
    }
    // Unsupported node type (e.g. CDATA section, processing instruction): contribute nothing.
    return "";
  }
  /**
   * Serializes a text node.
   *
   * The text is emitted verbatim when the parent is not an element or when the
   * parent tag is listed in TEXT_NODE_TAGS; inside a script parent, "</" is
   * rewritten to "<\/" and "/>" to "\/>". In every other case "&", U+00A0,
   * "<" and ">" are replaced by the matching entities.
   *
   * @param {Text} textNode - text node to serialize
   * @returns {string} the serialized text
   */
  function serializeTextNode(textNode) {
    const { parentNode, textContent } = textNode;
    const parentIsElement = Boolean(parentNode) && parentNode.nodeType == Node_ELEMENT_NODE;
    const parentTagName = parentIsElement ? parentNode.tagName.toLowerCase() : undefined;
    const isRawText = !parentTagName || TEXT_NODE_TAGS.includes(parentTagName);
    if (!isRawText) {
      return textContent
        .replace(/&/g, "&amp;")
        .replace(/\u00a0/g, "&nbsp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;");
    }
    if (parentTagName == "script") {
      return textContent
        .replace(/<\//gi, "<\\/")
        .replace(/\/>/gi, "\\/>");
    }
    return textContent;
  }
  /**
   * Serializes a comment node by wrapping its text between "<!--" and "-->".
   * The text is written as-is, without any escaping.
   *
   * @param {Comment} commentNode - comment node to serialize
   * @returns {string} the comment markup
   */
  function serializeCommentNode(commentNode) {
    const { textContent } = commentNode;
    return `<!--${textContent}-->`;
  }
  /**
   * Serializes an element: start tag with attributes, children, then end tag.
   *
   * When `compressHTML` is set, the start tag is dropped if a rule of
   * OMITTED_START_TAGS accepts the element and it has no attributes, and the
   * end tag is dropped if a rule of OMITTED_END_TAGS accepts its next sibling.
   * Tags listed in SELF_CLOSED_TAG_NAMES get no end tag. Inside SVG content
   * (`isSVG`), the end tag is always written.
   *
   * An empty <template> contributes its `innerHTML` instead of child nodes.
   *
   * @param {Element} element - element to serialize
   * @param {boolean} compressHTML - enables the tag-omission rules and compact attributes
   * @param {boolean} [isSVG] - true when the element is a descendant of an <svg> element
   * @returns {string} the markup for the element and its subtree
   */
  function serializeElement(element, compressHTML, isSVG) {
    const tagName = element.tagName.toLowerCase();
    const omittedStartTag = compressHTML && OMITTED_START_TAGS.find(omittedStartTag => tagName == omittedStartTag.tagName && omittedStartTag.accept(element));
    let content = "";
    if (!omittedStartTag || element.attributes.length) {
      content = "<" + tagName;
      Array.from(element.attributes).forEach(attribute => content += serializeAttribute(attribute, element, compressHTML));
      content += ">";
    }
    if (element.tagName == "TEMPLATE" && !element.childNodes.length) {
      content += element.innerHTML;
    } else {
      // Children of <foreignObject> are HTML again: keeping the SVG flag would force an
      // end tag on HTML void elements, and "<br></br>" is parsed back as two <br> elements.
      const childrenInSVG = (isSVG || tagName == "svg") && tagName != "foreignobject";
      Array.from(element.childNodes).forEach(childNode => content += serialize(childNode, compressHTML, childrenInSVG));
    }
    const omittedEndTag = compressHTML && OMITTED_END_TAGS.find(omittedEndTag => tagName == omittedEndTag.tagName && omittedEndTag.accept(element.nextSibling, element));
    if (isSVG || (!omittedEndTag && !SELF_CLOSED_TAG_NAMES.includes(tagName))) {
      content += "</" + tagName + ">";
    }
    return content;
  }
  /**
   * Serializes one attribute as ` name`, ` name=value` or ` name="value"`
   * (with a leading space).
   *
   * - An attribute whose name contains `"`, `'`, `>`, `/` or `=` is skipped ("" is returned).
   * - "&" and U+00A0 in the value are replaced by entities.
   * - When `compressHTML` is set: the class value is rebuilt from
   *   `element.classList`, a value containing `"` but no `'` is wrapped in
   *   single quotes instead of being escaped, and quotes are dropped when the
   *   value contains no space, quote, backtick, "=", "<" or ">".
   * - An empty value is written as the bare attribute name.
   *
   * @param {Attr} attribute - attribute to serialize
   * @param {Element} element - owner element (only its `classList` is read)
   * @param {boolean} compressHTML - enables the compact output rules
   * @returns {string} the serialized attribute, or "" when it is skipped
   */
  function serializeAttribute(attribute, element, compressHTML) {
    const name = attribute.name;
    if (/["'>/=]/.test(name)) {
      return "";
    }
    let value = attribute.value;
    if (compressHTML && name == "class") {
      value = Array.from(element.classList).map(className => className.trim()).join(" ");
    }
    value = value.replace(/&/g, "&amp;").replace(/\u00a0/g, "&nbsp;");
    let quote = "\"";
    if (value.includes("\"")) {
      if (value.includes("'") || !compressHTML) {
        value = value.replace(/"/g, "&quot;");
      } else {
        // Only double quotes inside: single-quote delimiters avoid escaping them.
        quote = "'";
      }
    }
    const canBeUnquoted = compressHTML && /^[^ \t\n\f\r'"`=<>]+$/.test(value);
    let content = " " + getSerializedAttributeName(attribute);
    if (value != "") {
      content += "=" + (canBeUnquoted ? value : quote + value + quote);
    }
    return content;
  }

  /**
   * Returns the name to write for an attribute, following the HTML fragment
   * serialization rules: attributes in the XML, XMLNS and XLink namespaces
   * get a fixed prefix in front of their local name, every other attribute
   * keeps its qualified name. (The previous code tested `attribute.namespace`,
   * which is not an Attr property, so its namespace branches never ran.)
   */
  function getSerializedAttributeName(attribute) {
    const name = attribute.name;
    const localName = attribute.localName;
    const namespaceURI = attribute.namespaceURI;
    if (!namespaceURI || !localName) {
      return name;
    }
    if (namespaceURI == "http://www.w3.org/XML/1998/namespace") {
      return "xml:" + localName;
    }
    if (namespaceURI == "http://www.w3.org/2000/xmlns/") {
      return localName == "xmlns" ? "xmlns" : "xmlns:" + localName;
    }
    if (namespaceURI == "http://www.w3.org/1999/xlink") {
      return "xlink:" + localName;
    }
    return name;
  }
  /**
   * Tells whether a string begins with a space, tab, line feed, form feed or
   * carriage return character.
   *
   * @param {string} textContent - text to inspect
   * @returns {boolean} true when the first character is one of those five characters
   */
  function startsWithSpaceChar(textContent) {
    const leadingSpace = textContent.match(/^[ \t\n\f\r]/);
    return leadingSpace !== null;
  }
  171. })();