基于 Go 实现一个 Markdown 解析器
概述
仓库链接:https://github.com/songquanpeng/md2html 。
简单说一下流程:
- 把 Markdown 字符串分解为 Token 流。
- 从 Token 流中构建出抽象语法树(AST)。
- 遍历抽象语法树,生成 HTML 文档。
词法分析部分
核心代码:
// nextToken scans the package-level input starting at pos and returns two
// tokens: textToken holds the plain text accumulated before a markup symbol
// was recognised (its Value may be empty), and otherToken is the markup,
// newline, tab, or EOF token that ended the scan. pos is advanced past
// everything that was consumed.
//
// Block-level symbols (titles, list markers, quotes, code fences, dividing
// lines, leading blanks) are only recognised while no text has been
// accumulated and the previous token was a newline or a tab. Inline symbols
// are recognised at any position.
func nextToken() (textToken, otherToken Token) {
	textToken.Type = TextToken
	for {
		if pos >= len(input) {
			otherToken.Type = EofToken
			return
		}
		c := input[pos]
		if len(textToken.Value) == 0 && (lastTokenType == NewlineToken || lastTokenType == TabToken) {
			switch c {
			case '#':
				// The number of '#' symbols is stored as a single rune in Value.
				n := countSymbol(c)
				otherToken.Type = TitleToken
				otherToken.Value = append(otherToken.Value, rune(n))
				pos += n
				// The run of '#' may end the input, so check bounds before
				// skipping the optional separating space.
				if pos < len(input) && input[pos] == ' ' {
					pos++
				}
				return
			case '\t':
				otherToken.Type = TabToken
				pos++
				return
			case '\n':
				otherToken.Type = NewlineToken
				pos++
				return
			case '-', '+', '*':
				if isSpaceBehind() {
					otherToken.Type = UnorderedListToken
					pos += 2
					yes, completed := isTaskSymbol()
					if yes {
						// Tentatively step into the task marker; this is undone
						// below when the marker is not followed by a space.
						pos += 2
						if isSpaceBehind() {
							pos += 2
							if completed {
								otherToken.Type = CompletedTaskToken
							} else {
								otherToken.Type = UncompletedTaskToken
							}
							return
						}
						pos -= 2
					}
					return
				}
				// Not a list marker: three identical symbols in a row form a
				// dividing line.
				if nextIsSameTo(c) {
					pos++
					if nextIsSameTo(c) {
						pos += 2
						otherToken.Type = DividingLineToken
						return
					}
					pos--
				}
			case '>':
				if isSpaceBehind() {
					otherToken.Type = QuoteToken
					pos += 2
					return
				}
			case '`':
				// Three backticks in a row open a code block.
				if nextIsSameTo(c) {
					pos++
					if nextIsSameTo(c) {
						pos += 2
						otherToken.Type = CodeBlockToken
						start, end := getCodeBlockStartEnd()
						otherToken.Value = input[start:end]
						// Skip the three closing backticks.
						pos = end + 3
						return
					}
					pos--
				}
			case '\r', ' ':
				// Two or more of the same blank character are reported as a
				// TabToken; a single one is skipped.
				n := countSymbol(c)
				if n >= 2 {
					pos += n
					otherToken.Type = TabToken
					return
				}
				pos++
			}
			if isNumDotSpace() {
				otherToken.Type = OrderedListToken
				return
			}
		}
		// Skipping a single blank character above may have consumed the last
		// character of the input; nothing has been accumulated in that case, so
		// report EOF instead of reading past the end.
		if pos >= len(input) {
			otherToken.Type = EofToken
			return
		}
		// Re-read c because pos may have advanced past a blank symbol above.
		c = input[pos]
		// From here on, returning hands back the accumulated text token together
		// with the inline token recognised below.
		switch c {
		case '*':
			if nextIsSameTo(c) {
				pos += 2
				otherToken.Type = DoubleStarToken
				otherToken.Value = []rune("**")
			} else {
				pos += 1
				otherToken.Type = SingleStarToken
				otherToken.Value = []rune("*")
			}
			return
		case '_':
			if nextIsSameTo(c) {
				pos += 2
				otherToken.Type = DoubleUnderscoreToken
				otherToken.Value = []rune("__")
			} else {
				pos += 1
				otherToken.Type = SingleUnderscoreToken
				otherToken.Value = []rune("_")
			}
			return
		case '~':
			// A single '~' is ordinary text; only "~~" is a token.
			if nextIsSameTo(c) {
				pos += 2
				otherToken.Type = DoubleTildeToken
				otherToken.Value = []rune("~~")
				return
			}
		case '`':
			otherToken.Type = SingleBacktickToken
			otherToken.Value = []rune("`")
			pos++
			return
		case '!':
			if nextIsSameTo('[') {
				pos += 2
				otherToken.Type = ImageHeadToken
				return
			}
		case '[':
			pos++
			otherToken.Type = LinkHeadToken
			return
		case ']':
			if nextIsSameTo('(') {
				pos += 2
				// Look for the closing ')' on the same line; Value becomes the
				// text between the parentheses.
				for i := pos; i < len(input) && input[i] != '\n'; i++ {
					if input[i] == ')' {
						otherToken.Type = LinkBodyToken
						otherToken.Value = input[pos:i]
						pos = i + 1
						return
					}
				}
				// No closing parenthesis on this line: rewind and treat ']' as text.
				pos -= 2
			}
		case '\n':
			otherToken.Type = NewlineToken
			pos++
			return
		case '\t':
			otherToken.Type = TabToken
			pos++
			return
		}
		// Ordinary character: append it to the text token, dropping carriage
		// returns.
		pos++
		if c != '\r' {
			textToken.Value = append(textToken.Value, c)
		}
	}
}
语法分析部分
核心代码:
// parseSectionList builds the root node of the syntax tree. It repeatedly
// looks at the next token, dispatches to the parser for the matching section
// kind, and appends the resulting node to root.Children until an EofToken is
// seen.
//
// Newline and tab tokens produce no node: a newline resets tabCounter to zero
// and a tab increments it.
func parseSectionList() (root *Node) {
	root = &Node{}
	for {
		// Inspect the upcoming token, then hand it back so the section parser
		// chosen below can read it itself (presumably restoreToken undoes the
		// getToken call; confirm against its definition).
		next := getToken()
		restoreToken()

		var section *Node
		switch next.Type {
		case lexer.EofToken:
			return
		case lexer.NewlineToken:
			_ = getToken()
			tabCounter = 0
			continue
		case lexer.TabToken:
			tabCounter++
			_ = getToken()
			continue
		case lexer.TitleToken:
			section = parseTitle()
		case lexer.DividingLineToken:
			section = parseDividingLine()
		case lexer.CodeBlockToken:
			section = parseCodeBlock()
		case lexer.UncompletedTaskToken,
			lexer.CompletedTaskToken,
			lexer.UnorderedListToken,
			lexer.OrderedListToken:
			section = parseList()
		case lexer.QuoteToken:
			section = parseQuote()
		default:
			section = parseContent(false)
		}
		root.Children = append(root.Children, section)
	}
}
HTML 代码生成部分
这部分最好玩也最简单,主要代码:
// Convert renders a Markdown string as HTML.
//
// The input is parsed into a syntax tree and rendered by processArticleNode.
// When fullPage is true the rendered fragment is inserted, together with
// Style, into HtmlTemplate; otherwise the fragment is returned as is. When
// the MODE environment variable equals "debug" the syntax tree is printed
// via parser.PrintAST before rendering.
func Convert(markdown string, fullPage bool) (html string) {
	tree := parser.Parse(markdown)
	if mode := os.Getenv("MODE"); mode == "debug" {
		parser.PrintAST(tree)
	}

	body := processArticleNode(tree)
	if !fullPage {
		return body
	}
	return fmt.Sprintf(HtmlTemplate, Style, body)
}
// processArticleNode renders every child of node in order and wraps the
// concatenated result in a <div class='article'> element. Content nodes are
// additionally wrapped in their own <div>; children of any other type not
// listed in the switch contribute nothing to the output.
func processArticleNode(node *parser.Node) (html string) {
	for i := range node.Children {
		child := node.Children[i]

		var fragment string
		switch child.Type {
		case parser.TitleNode:
			fragment = processTitleNode(child)
		case parser.DividingLineNode:
			fragment = processDividingLineNode(child)
		case parser.ContentNode:
			fragment = fmt.Sprintf("<div>%s</div>\n", processContentNode(child))
		case parser.ListNode:
			fragment = processListNode(child)
		case parser.QuoteNode:
			fragment = processQuoteNode(child)
		case parser.CodeBlockNode:
			fragment = processCodeBlockNode(child)
		}
		html += fragment
	}
	return fmt.Sprintf("<div class='article'>\n%s\n</div>", html)
}
// processTitleNode renders a title node as an HTML heading element.
//
// The heading text is rendered from the node's first child and the heading
// level is read from the first element of node.Value. A node without
// children renders an empty heading, and a node without a value is treated
// as level 1, so neither case panics. The level is clamped to the range
// 1..6 because HTML only defines the elements h1 through h6.
func processTitleNode(node *parser.Node) (html string) {
	var content string
	if len(node.Children) > 0 {
		content = processContentNode(node.Children[0])
	}

	level := 1
	if len(node.Value) > 0 {
		level = int(node.Value[0])
	}
	if level < 1 {
		level = 1
	} else if level > 6 {
		level = 6
	}

	return fmt.Sprintf("<h%d>%s</h%d>\n", level, content, level)
}
Links: md2html