Refactor htmlToMd.ts (#934)

Part of #845

This is the first refactor of `htmlToMd.ts`. It brings the following
changes:

- New function named `buildMathExpression` to build math expressions
whether they are inlined or not
- New function named `removeEmphasisSpaces` to remove the initial and
trailing spaces in an emphasis node
- The Unified plugin and the creation of the handlers for it were
separated into two functions. The unified plugin was inlined into
`generateMarkdownFile`

---------

Co-authored-by: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com>
This commit is contained in:
Arnau Casau 2024-02-29 19:22:24 +01:00 committed by GitHub
parent d69ba5dc77
commit 5b0859eb4c
1 changed files with 77 additions and 75 deletions

View File

@ -22,7 +22,7 @@ import remarkMath from "remark-math";
import remarkMdx from "remark-mdx";
import { MdxJsxFlowElement } from "mdast-util-mdx-jsx";
import { visit } from "unist-util-visit";
import { Root } from "mdast";
import { Emphasis, Root } from "mdast";
import { processHtml } from "./processHtml";
import { HtmlToMdResult } from "./HtmlToMdResult";
@ -54,6 +54,39 @@ async function generateMarkdownFile(
mainHtml: string,
meta: Metadata,
): Promise<string> {
const handlers = prepareHandlers(meta);
const mdFile = await unified()
.use(rehypeParse)
.use(remarkGfm)
.use(remarkMath)
.use(remarkMdx)
.use(rehypeRemark, {
handlers,
})
.use(remarkStringify, remarkStringifyOptions)
.use(() => (root: Root) => {
// merge contiguous emphasis
visit(root, "emphasis", (node, index, parent) => {
if (index === null || parent === null) return;
let nextIndex = index + 1;
while (parent.children[nextIndex]?.type === "emphasis") {
node.children.push(
...((parent.children[nextIndex] as any).children ?? []),
);
nextIndex++;
}
parent.children.splice(index + 1, nextIndex - (index + 1));
removeEmphasisSpaces(node, index, parent, "initial");
removeEmphasisSpaces(node, index, parent, "tail");
});
})
.process(mainHtml);
return mdFile.toString().replaceAll(`<!---->`, "");
}
function prepareHandlers(meta: Metadata): Record<string, Handle> {
const handlers: Record<string, Handle> = {
br(h, node: any) {
return all(h, node);
@ -66,13 +99,7 @@ async function generateMarkdownFile(
},
span(h, node: any) {
if (node.properties.className?.includes("math")) {
let value = node.children[0].value;
const prefix = "\\(";
const sufix = "\\)";
if (value.startsWith(prefix) && value.endsWith(sufix)) {
value = value.substring(prefix.length, value.length - sufix.length);
}
return { type: "inlineMath", value };
return buildMathExpression(node, "inlineMath");
}
if (node.properties.id && node.properties.className?.includes("target")) {
@ -87,13 +114,7 @@ async function generateMarkdownFile(
},
pre(h, node: any) {
if (node.properties.className?.includes("math")) {
let value = node.children[0].value;
const prefix = "\\[";
const sufix = "\\]";
if (value.startsWith(prefix) && value.endsWith(sufix)) {
value = value.substring(prefix.length, value.length - sufix.length);
}
return { type: "math", value };
return buildMathExpression(node, "math");
}
return defaultHandlers.pre(h, node);
},
@ -149,10 +170,7 @@ async function generateMarkdownFile(
(child: any) =>
child.properties.className?.includes("versionmodified"),
);
let title = toText(titleNode).trim();
if (title.endsWith(":")) {
title = title.slice(0, -1);
}
const title = toText(titleNode).trim().replace(/:$/, "");
const otherChildren = without(root.children, titleNode);
return buildAdmonition({
title,
@ -172,64 +190,38 @@ async function generateMarkdownFile(
},
};
const mdFile = await unified()
.use(rehypeParse)
.use(remarkGfm)
.use(remarkMath)
.use(remarkMdx)
.use(rehypeRemark, {
handlers,
})
.use(remarkStringify, remarkStringifyOptions)
.use(() => (root: Root) => {
// merge contiguous emphasis
visit(root, "emphasis", (node, index, parent) => {
if (index === null || parent === null) return;
let nextIndex = index + 1;
while (parent.children[nextIndex]?.type === "emphasis") {
node.children.push(
...((parent.children[nextIndex] as any).children ?? []),
);
nextIndex++;
return handlers;
}
parent.children.splice(index + 1, nextIndex - (index + 1));
// remove initial and trailing spaces from emphasis
const firstChild = first(node.children);
if (firstChild?.type === "text") {
const match = firstChild.value.match(/^\s+/);
function removeEmphasisSpaces(
node: Emphasis,
index: number,
parent: any,
position: "initial" | "tail",
): void {
const child =
position == "initial" ? first(node.children) : last(node.children);
const reg = position == "initial" ? /^\s+/ : /\s+$/;
const idx = position == "initial" ? index : index + 1;
if (child?.type === "text") {
const match = child.value.match(reg);
if (match) {
if (match[0] === firstChild.value) {
node.children = tail(node.children);
if (match[0] === child.value) {
node.children =
position == "initial" ? tail(node.children) : initial(node.children);
} else {
firstChild.value = removePrefix(firstChild.value, match[0]);
child.value =
position == "initial"
? removePrefix(child.value, match[0])
: removeSuffix(child.value, match[0]);
}
parent.children.splice(index, 0, {
parent.children.splice(idx, 0, {
type: "text",
value: match[0],
});
}
}
const lastChild = last(node.children);
if (lastChild?.type === "text") {
const match = lastChild.value.match(/\s+$/);
if (match) {
if (match[0] === lastChild.value) {
node.children = initial(node.children);
} else {
lastChild.value = removeSuffix(lastChild.value, match[0]);
}
parent.children.splice(index + 1, 0, {
type: "text",
value: match[0],
});
}
}
});
})
.process(mainHtml);
return mdFile.toString().replaceAll(`<!---->`, "");
}
function buildAdmonition(options: {
@ -271,3 +263,13 @@ function buildSpanId(id: string): MdxJsxFlowElement {
children: [],
};
}
function buildMathExpression(node: any, type: "math" | "inlineMath"): any {
let value = node.children[0].value;
const prefix = type == "math" ? "\\[" : "\\(";
const sufix = type == "math" ? "\\]" : "\\)";
if (value.startsWith(prefix) && value.endsWith(sufix)) {
value = value.substring(prefix.length, value.length - sufix.length);
}
return { type: type, value };
}