1 module mood.parser;
2 
import std.stdio;
import std.string;
import mood.node;
6 
/**
 * Splits raw DHTML source text into coarse tokens.
 *
 * This is not a full HTML parse. It only separates the input into pieces
 * so that a later parsing stage has less work to do:
 *
 *  - every `<...>` tag becomes its own token,
 *  - text between tags becomes its own token (possibly empty or whitespace-only),
 *  - `<?D ... /?>`, `<script>...</script>` and `<style>...</style>` are each kept
 *    together as one token, even when their bodies contain `<` or `>`,
 *  - `{{ ... }}` is kept together as one token.
 *
 * Only the exact openers `<script>` and `<style>` (without attributes) start a
 * verbatim region.
 *
 * Params:
 *  dhtml = The html that is to be tokenized.
 * Returns: Tokenized html in the form of a string array. The array always
 *          contains at least one (possibly empty) element.
*/
string[] tokenizeDHTML(const string dhtml)
{
    // The kind of verbatim region the scanner is currently inside, if any.
    enum Region { none, dCode, script, style, insert }

    // True when the input up to and including index `last` ends with `tail`.
    bool endsAt(size_t last, string tail)
    {
        return last + 1 >= tail.length
            && dhtml[last + 1 - tail.length .. last + 1] == tail;
    }

    // True when `head` occurs in the input starting at index `first`.
    bool beginsAt(size_t first, string head)
    {
        return first + head.length <= dhtml.length
            && dhtml[first .. first + head.length] == head;
    }

    string[] pieces = [""];
    Region region = Region.none;
    bool skipNext = false; // set after "}}" so its second brace is not scanned again

    foreach (pos, ch; dhtml)
    {
        if (skipNext)
        {
            skipNext = false;
            continue;
        }

        if (ch == '>')
        {
            // '>' always belongs to the current token; it only finishes the
            // token when it closes a plain tag or the active verbatim region.
            pieces[$ - 1] ~= ch;

            bool tokenDone;
            final switch (region)
            {
                case Region.none:
                    tokenDone = true;
                    break;
                case Region.dCode:
                    tokenDone = endsAt(pos, "/?>");
                    break;
                case Region.script:
                    tokenDone = endsAt(pos, "</script>");
                    break;
                case Region.style:
                    tokenDone = endsAt(pos, "</style>");
                    break;
                case Region.insert:
                    tokenDone = false;
                    break;
            }

            if (tokenDone)
            {
                pieces ~= "";
                region = Region.none;
            }
        }
        else if (ch == '<' && region == Region.none)
        {
            // A '<' outside any verbatim region starts a new token and may
            // open one of the verbatim regions.
            if (beginsAt(pos, "<?D"))
                region = Region.dCode;
            else if (beginsAt(pos, "<script>"))
                region = Region.script;
            else if (beginsAt(pos, "<style>"))
                region = Region.style;

            pieces ~= "<";
        }
        else if (region == Region.none && beginsAt(pos, "{{"))
        {
            // Only the first brace is emitted here; the second one is appended
            // by the fall-through branch on the next iteration.
            region = Region.insert;
            pieces ~= "{";
        }
        else if (region == Region.insert && beginsAt(pos, "}}"))
        {
            region = Region.none;
            pieces[$ - 1] ~= "}}";
            pieces ~= "";
            skipNext = true;
        }
        else
        {
            pieces[$ - 1] ~= ch;
        }
    }

    return pieces;
}
/**
 * Removes tokens that carry no information.
 *
 * A token is dropped when it is empty or consists only of whitespace.
 * All other tokens are kept unchanged (they are not trimmed) and stay in
 * their original order.
 *
 * Params:
 *  data = Tokens, e.g. as produced by tokenizeDHTML.
 * Returns: A new array containing only the tokens that have visible content.
*/
private string[] removeJunk(const string[] data)
{
    import std.string : strip;

    string[] sanitized;
    foreach (tok; data)
    {
        // strip is used only to test for content; the stored token keeps its whitespace
        if (tok.strip.length > 0)
            sanitized ~= tok;
    }
    return sanitized;
}
114 
unittest
{
    /* Test 1: a plain HTML document is split into tags and text content. */
    writeln("starting parsing test 1");
    string html = 
    `<!DOCTYPE html>
    <html>
        <head>
            <title>hello world</title>
        </head>
        <body>
            <h1>Hello World!</h1>
            <p>lorem ipsum text</p>
        </body>
    </html>`;
    string[] tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);

    string[] expectedPlain = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        "<title>",
        "hello world",
        "</title>",
        "</head>",
        "<body>",
        "<h1>",
        "Hello World!",
        "</h1>",
        "<p>",
        "lorem ipsum text",
        "</p>",
        "</body>",
        "</html>",
    ];
    foreach (idx, want; expectedPlain)
        assert(tokens[idx] == want);

    /* Test 2: each <?D ... /?> block is kept together as a single token. */
    writeln("starting test 2");
    html = 
`<!DOCTYPE html>
<?D
    import std.stdio;
/?>
<html>
    <body>
        <?D
            output("Hello World");
        /?>
    </body>
</html>`;
    tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);

    string topLevelCode = 
`<?D
    import std.stdio;
/?>`;
    string nestedCode = 
        `<?D
            output("Hello World");
        /?>`;
    assert(tokens[1] == topLevelCode);
    assert(tokens[4] == nestedCode);

    /* Test 3: a {{ ... }} insert is kept together as a single token. */
    writeln("starting test 3");
    html =
`<!DOCTYPE html>
<html>
    {{ variable }}
</html>`;

    tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);

    string[] expectedInsert = [
        "<!DOCTYPE html>",
        "<html>",
        "{{ variable }}",
        "</html>",
    ];
    foreach (idx, want; expectedInsert)
        assert(tokens[idx] == want);
}
191 
/**
 * Parses tokenized html data.
 *
 * Takes in tokenized html data and outputs one node per non-empty token.
 * Every node stores the token it came from in `original`. Tokens are
 * classified as follows:
 *
 *  - Tokens whose stripped text starts with `<!--` or `-->` become comment
 *    nodes (opening and closing respectively) with no further processing.
 *  - Tokens starting with `</` are closing tags; their `content` is the tag
 *    text including the leading slash (e.g. `/div`).
 *  - Tokens starting with `<?D` are code nodes; `content` is the text between
 *    `<?D` and the closing `/?>`.
 *  - Tokens starting with `{{` are insert nodes; `content` is the text between
 *    `{{` and the closing `}}`.
 *  - Any other token starting with `<` is an opening tag; `content` is the tag
 *    name and `attributes` holds the parsed attributes.
 *  - Everything else is a content node whose `content` is the token itself.
 *
 * Unterminated tokens (a tag without `>`, code without `/?>`, an insert
 * without `}}`) are accepted; the missing delimiter is simply not removed.
 *
 * Params:
 *  tokens = The tokens that are output from tokenizing the html data.
 * Returns: Parsed html nodes, in token order. Empty tokens produce no node.
*/
Node[] parseDHTML(const string[] tokens)
{
    import std.algorithm.searching : endsWith, startsWith;
    import std.string : indexOf, strip;

    Node[] nodes;
    foreach (tok; tokens)
    {
        if (tok.length == 0)
            continue;

        Node current;
        current.original = tok;

        // Comment delimiters are recognised on the stripped token and are not
        // analysed any further.
        const trimmed = tok.strip;
        if (trimmed.startsWith("<!--") || trimmed.startsWith("-->"))
        {
            current.nodeType = trimmed.startsWith("<!--")
                ? NodeType.OpeningTag
                : NodeType.ClosingTag;
            current.tagType = TagType.Comment;
            nodes ~= current;
            continue;
        }

        // first, determine the NodeType
        if (tok.startsWith("</"))
            current.nodeType = NodeType.ClosingTag;
        else if (tok.startsWith("<") || tok.startsWith("{{"))
            current.nodeType = NodeType.OpeningTag;
        else
            current.nodeType = NodeType.Content;

        // second, determine the tag type and fill in the content
        if (current.nodeType == NodeType.Content)
        {
            current.tagType = TagType.None;
            current.content = tok;
        }
        else if (tok.startsWith("<?D"))
        {
            current.tagType = TagType.Code;
            // Remove the opener, and the closer only when it is really there,
            // so an unterminated block neither loses code nor slices out of range.
            string code = tok[3 .. $];
            if (code.endsWith("/?>"))
                code = code[0 .. $ - 3];
            current.content = code;
        }
        else if (tok.startsWith("{{"))
        {
            current.tagType = TagType.Insert;
            string expr = tok[2 .. $];
            if (expr.endsWith("}}"))
                expr = expr[0 .. $ - 2];
            current.content = expr;
        }
        else
        {
            current.tagType = TagType.Tag;

            // Drop the leading '<' and, when present, the trailing '>'.
            string inner = tok[1 .. $];
            if (inner.endsWith(">"))
                inner = inner[0 .. $ - 1];

            // The tag name runs up to the first space.
            ptrdiff_t nameEnd = inner.indexOf(" ");
            if (nameEnd == -1)
                nameEnd = inner.length;
            current.content = inner[0 .. nameEnd];

            // Only opening tags carry attributes.
            current.attributes = current.nodeType == NodeType.OpeningTag
                ? parseTagAttributes(inner[nameEnd .. $])
                : null;
        }

        nodes ~= current;
    }
    return nodes;
}

/**
 * Parses the attribute section of an opening tag.
 *
 * Attributes are separated by spaces. Three forms are recognised:
 *
 *  - `name`          -> AttributeType.Parameter, only `attribute` is set
 *  - `name="value"`  -> AttributeType.String, `val` is the text between the quotes
 *  - `name=value`    -> AttributeType.Number, `val` is the text up to the next space
 *
 * A missing closing quote makes the value run to the end of the text.
 * Every iteration consumes at least one character, so the loop always
 * terminates and the number of attributes is not limited.
 *
 * Params:
 *  text = Everything in the tag after the tag name, without the closing `>`.
 * Returns: The attributes in source order; empty when there are none.
*/
private Attribute[] parseTagAttributes(string text)
{
    import std.string : indexOf, stripLeft;

    Attribute[] attrs;
    while (true)
    {
        text = text.stripLeft;
        if (text.length == 0)
            break;

        Attribute attr;
        const ptrdiff_t space = text.indexOf(" ");
        const ptrdiff_t equals = text.indexOf("=");

        // The first word is a bare attribute when it has no '=' of its own,
        // i.e. there is no '=' at all or a space comes before the first '='.
        const bool bare = equals == -1 || (space != -1 && space < equals);

        if (bare)
        {
            // text[0] is not whitespace after stripLeft, so `end` is at least 1.
            const size_t end = space == -1 ? text.length : space;
            attr.attribute = text[0 .. end];
            attr.type = AttributeType.Parameter;
            text = text[end .. $];
        }
        else if (equals + 1 < text.length && text[equals + 1] == '"')
        {
            // Quoted value: runs up to the next quote, or to the end if unterminated.
            attr.attribute = text[0 .. equals];
            attr.type = AttributeType.String;
            const ptrdiff_t closing = text.indexOf("\"", equals + 2);
            if (closing == -1)
            {
                attr.val = text[equals + 2 .. $];
                text = "";
            }
            else
            {
                attr.val = text[equals + 2 .. closing];
                text = text[closing + 1 .. $];
            }
        }
        else
        {
            // Unquoted value: runs up to the next space.
            attr.attribute = text[0 .. equals];
            attr.type = AttributeType.Number;
            ptrdiff_t valueEnd = text.indexOf(" ", equals);
            if (valueEnd == -1)
                valueEnd = text.length;
            attr.val = text[equals + 1 .. valueEnd];
            text = text[valueEnd .. $];
        }

        attrs ~= attr;
    }
    return attrs;
}