-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpeg-html-parser.pegls
88 lines (71 loc) · 3.27 KB
/
peg-html-parser.pegls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
{
  # Build a result node: `node` is always set; `attrs` is added only when
  # `attributes` is truthy and `content` only when it is non-empty.
  function node name, attributes, content
    result = {node: name}
    result.attrs = attributes if attributes
    result.content = content if content and content.length
    result

  # Fold a list of {name, text} pairs into a plain object keyed by name.
  # Entries that are falsy or carry no name are skipped.
  function reduceToObj xs
    attr = {}
    for x in xs when x and x.name
      attr[x.name] = x.text
    attr
}
/**
 * Entry rule: skips leading whitespace, then collects every top-level
 * element into an array, which is the parse result.
 */
Document
  = __ elements:Element* { elements }
/**
 * Elements - https://www.w3.org/TR/html5/syntax.html#elements-0
 *
 * Alternatives are tried in order. Nested must come before Void because
 * Void accepts any opening tag, and Text is the last resort.
 */
Element
  = RawText
  / Nested
  / Void
  / Comment
  / DocType
  / Text
// Elements whose body is captured verbatim as a string instead of being parsed.
RawText
  = Script
  / Style
  / Textarea
  / Title
  / PlainText
/**
 * <script> element: everything up to the closing </script> tag is kept as one
 * string (leading whitespace after the opening tag is skipped).
 * The negative lookahead after the tag name stops names that merely start
 * with "script" (e.g. <script-loader>) from being taken for this element.
 */
Script "script"
  = '<script'i ![a-zA-Z0-9:_\-] attrs:Attributes '>' __
    content:(ch:(!('</script'i __ '>') c:. { c })* { ch.join '' })
    __ '</script'i __ '>' __
    { node 'script', attrs, content }
/**
 * <style> element: everything up to the closing </style> tag is kept as one
 * string (leading whitespace after the opening tag is skipped).
 * The negative lookahead after the tag name stops names that merely start
 * with "style" (e.g. <style-guide>) from being taken for this element.
 */
Style "style"
  = '<style'i ![a-zA-Z0-9:_\-] attrs:Attributes '>' __
    content:(ch:(!('</style'i __ '>') c:. { c })* { ch.join '' })
    __ '</style'i __ '>' __
    { node 'style', attrs, content }
/**
 * <textarea> element: everything up to the closing </textarea> tag is kept as
 * one string (leading whitespace after the opening tag is skipped).
 * The negative lookahead after the tag name stops names that merely start
 * with "textarea" (e.g. <textarea-autosize>) from being taken for this element.
 */
Textarea "textarea"
  = '<textarea'i ![a-zA-Z0-9:_\-] attrs:Attributes '>' __
    content:(ch:(!('</textarea'i __ '>') c:. { c })* { ch.join '' })
    __ '</textarea'i __ '>' __
    { node 'textarea', attrs, content }
/**
 * <title> element: everything up to the closing </title> tag is kept as one
 * string (leading whitespace after the opening tag is skipped).
 * The negative lookahead after the tag name stops names that merely start
 * with "title" (e.g. <title-bar>) from being taken for this element.
 */
Title "title"
  = '<title'i ![a-zA-Z0-9:_\-] attrs:Attributes '>' __
    content:(ch:(!('</title'i __ '>') c:. { c })* { ch.join '' })
    __ '</title'i __ '>' __
    { node 'title', attrs, content }
/**
 * <plaintext> element: everything up to the closing </plaintext> tag is kept
 * as one string (leading whitespace after the opening tag is skipped).
 * The negative lookahead after the tag name stops names that merely start
 * with "plaintext" from being taken for this element.
 */
PlainText "plaintext"
  = '<plaintext'i ![a-zA-Z0-9:_\-] attrs:Attributes '>' __
    content:(ch:(!('</plaintext'i __ '>') c:. { c })* { ch.join '' })
    __ '</plaintext'i __ '>' __
    { node 'plaintext', attrs, content }
/**
 * Element with an opening tag, child elements and a closing tag.
 * The semantic predicate rejects the match unless the (lowercased) closing
 * tag name equals the opening one.
 */
Nested "element"
  = open:TagBegin __ children:Element* __ close:TagEnd __ &{ open.node == close } {
      node open.node, open.attrs, children
    }
// Opening tag; yields a node carrying the lowercased tag name and its attributes.
TagBegin "begin tag"
  = '<' tag:Symbol props:Attributes '>' { node tag.toLowerCase!, props }
// Closing tag; yields just the lowercased tag name.
TagEnd "end tag"
  = '</' tag:Symbol __ '>' { tag.toLowerCase! }
/**
 * Void element (with self closing tag, w/o content)
 * - 'area'i / 'base'i / 'br'i / 'col'i / 'embed'i / 'hr'i / 'img'i / 'input'i / 'keygen'i / 'link'i / 'meta'i / 'param'i / 'source'i / 'track'i / 'wbr'i
 *
 * The rule does not enforce the list above: any tag name is accepted, closed
 * with either '/>' or '>'. The name is lowercased, the same way TagBegin and
 * TagEnd report theirs, so node names are case-consistent across element kinds.
 */
Void "element"
  = '<' name:Symbol attrs:Attributes ('/>' / '>') __ { node name.toLowerCase!, attrs }
// HTML comment; the text between '<!--' and '-->' becomes the node content.
Comment "comment"
  = '<!--' body:CommentText '-->' __ { node 'comment', void, body }
// Every character up to (not including) the first '-->', joined into a string.
CommentText
  = chars:(!'-->' char:. { char })* { chars.join '' }
/**
 * <!DOCTYPE ...> declaration: a root name, an optional PUBLIC/SYSTEM keyword
 * and any number of identifier strings. `root` and `type` are lowercased;
 * `content` is set only when at least one string was present.
 */
DocType "doctype"
  = '<!DOCTYPE'i __ root:Symbol __ kind:('public'i / 'system'i)? __ ids:String* '>' __ {
      doctype = node 'doctype'
      doctype.root = root.toLowerCase! if root
      doctype.type = kind.toLowerCase! if kind
      doctype.content = ids if ids and ids.length
      doctype
    }
/**
 * Text node.
 * First alternative: a run of characters containing no '<'.
 * Second alternative: fallback for a '<' that starts nothing recognisable;
 * it consumes characters until a closing tag, a tag, a comment or a doctype
 * begins.
 */
Text "text"
  = chars:[^<]+ {
      node 'text', void, chars.join ''
    }
  / chars:(!TagEnd !Void !Comment !DocType char:. { char })+ {
      node 'text', void, chars.join ''
    }
/**
 * Element attributes
 *
 * Yields an object mapping attribute names to values, or nothing when the
 * tag has no attributes.
 */
Attributes
  = __ list:Attribute* __ { reduceToObj list if list and list.length }
/**
 * Single attribute.
 * First alternative: a name with an optional `= value`, yielding {name, text}.
 * Second alternative: skips one unparseable token (yielding nothing) so a
 * malformed attribute does not fail the whole tag. The token ends at any
 * whitespace character, not only at a space, so attributes that follow it
 * after a tab or newline are still parsed.
 */
Attribute "attribute"
  = name:Symbol __ text:(__ '=' __ s:String { s })? __ { {name, text} }
  / !'/>' [^>\r\n \t\u000C]+ __ { void }
/**
 * String - single, double, w/o quotes
 *
 * An unquoted value ends at any whitespace character (the same set the `__`
 * rule skips), not only at a space, so a value followed by a tab or newline
 * does not absorb the next attribute.
 */
String "string"
  = '"' ch:[^"]* '"' __ { ch.join '' }
  / '\'' ch:[^']* '\'' __ { ch.join '' }
  / ch:[^"'<>`\r\n \t\u000C]+ __ { ch.join '' }
/**
 * Tag name, attribute name
 *
 * First character: letter, digit, '_' or '-'; later characters may also be ':'.
 * Yields the matched name as a string.
 */
Symbol
  = head:[a-zA-Z0-9_\-] rest:[a-zA-Z0-9:_\-]* { head + rest.join '' }
// Optional whitespace: space, tab, CR, LF and form feed, in any number.
__ "space characters"
  = [ \t\r\n\u000C]*