module mood.parser;

import std.algorithm.searching : endsWith, startsWith;
import std.stdio;
import std.string;
import mood.node;

/// Kind of verbatim region the tokenizer is currently inside.
private enum CodeRegion
{
    none,   /// ordinary markup
    dCode,  /// between `<?D` and `/?>`
    script, /// between `<script>` and `</script>`
    style,  /// between `<style>` and `</style>`
    insert  /// between `{{` and `}}`
}

/// Characters that separate a tag name from its attributes and attributes from each other.
private enum string tagWhitespace = " \t\r\n\v\f";

/// Returns the region opened by the `<` that `rest` starts with, or `CodeRegion.none`.
private CodeRegion regionOpenedBy(string rest)
{
    if (rest.startsWith("<?D"))
        return CodeRegion.dCode;
    // Only the attribute-less forms open a verbatim region.
    if (rest.startsWith("<script>"))
        return CodeRegion.script;
    if (rest.startsWith("<style>"))
        return CodeRegion.style;
    return CodeRegion.none;
}

/// Returns true when `consumed` (the input up to and including a `>`) ends with the terminator of `region`.
private bool closesRegion(CodeRegion region, string consumed)
{
    final switch (region)
    {
        case CodeRegion.dCode:
            return consumed.endsWith("/?>");
        case CodeRegion.script:
            return consumed.endsWith("</script>");
        case CodeRegion.style:
            return consumed.endsWith("</style>");
        case CodeRegion.insert:
        case CodeRegion.none:
            return false;
    }
}

/**
 * Takes raw html data and tokenizes it.
 *
 * Does not completely parse the html, but is used to make parsing much easier, and to separate it
 * into different parts. Every `<...>` tag becomes its own token, and so does the text between tags.
 * `<?D ... /?>`, `<script>...</script>`, `<style>...</style>` and `{{ ... }}` are each kept together
 * as a single token, even when they contain `<` or `>`.
 *
 * Empty and whitespace-only tokens are part of the result; the first token is always the
 * (possibly empty) text that precedes the first tag or insert.
 *
 * Params:
 *  dhtml = The html that is to be tokenized.
 * Returns: Tokenized html in the form of a string array.
 */
string[] tokenizeDHTML(const string dhtml)
{
    string[] tokens = [""];
    CodeRegion region = CodeRegion.none;
    size_t pendingSkips = 0; // characters already emitted by an earlier iteration

    foreach (i, c; dhtml)
    {
        if (pendingSkips > 0)
        {
            pendingSkips--;
            continue;
        }

        string rest = dhtml[i .. $];

        if (c == '>')
        {
            tokens[$ - 1] ~= c;
            if (region == CodeRegion.none)
            {
                // End of an ordinary tag.
                tokens ~= "";
            }
            else if (closesRegion(region, dhtml[0 .. i + 1]))
            {
                tokens ~= "";
                region = CodeRegion.none;
            }
            // Otherwise this '>' is just part of the verbatim region.
        }
        else if (region == CodeRegion.none && c == '<')
        {
            region = regionOpenedBy(rest);
            tokens ~= "<";
        }
        else if (region == CodeRegion.none && rest.startsWith("{{"))
        {
            region = CodeRegion.insert;
            // The second '{' is appended by the next iteration.
            tokens ~= "{";
        }
        else if (region == CodeRegion.insert && rest.startsWith("}}"))
        {
            region = CodeRegion.none;
            tokens[$ - 1] ~= "}}";
            tokens ~= "";
            pendingSkips = 1; // the second '}' has just been emitted
        }
        else
        {
            tokens[$ - 1] ~= c;
        }
    }
    return tokens;
}

/// Returns the tokens of `data` that contain something other than whitespace, in order.
private string[] removeJunk(const string[] data)
{
    import std.string : strip;

    string[] sanitized;
    foreach (tok; data)
    {
        if (tok.strip.length > 0)
            sanitized ~= tok;
    }
    return sanitized;
}

unittest
{
    /* Test 1 */
    writeln("starting parsing test 1");
    string html =
`<!DOCTYPE html>
<html>
<head>
<title>hello world</title>
</head>
<body>
<h1>Hello World!</h1>
<p>lorem ipsum text</p>
</body>
</html>`;
    string[] tokens;
    tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);
    assert(tokens[0] == "<!DOCTYPE html>");
    assert(tokens[1] == "<html>");
    assert(tokens[2] == "<head>");
    assert(tokens[3] == "<title>");
    assert(tokens[4] == "hello world");
    assert(tokens[5] == "</title>");
    assert(tokens[6] == "</head>");
    assert(tokens[7] == "<body>");
    assert(tokens[8] == "<h1>");
    assert(tokens[9] == "Hello World!");
    assert(tokens[10] == "</h1>");
    assert(tokens[11] == "<p>");
    assert(tokens[12] == "lorem ipsum text");
    assert(tokens[13] == "</p>");
    assert(tokens[14] == "</body>");
    assert(tokens[15] == "</html>");

    /* Test 2 */
    writeln("starting test 2");
    html =
`<!DOCTYPE html>
<?D
import std.stdio;
/?>
<html>
<body>
<?D
output("Hello World");
/?>
</body>
</html>`;
    tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);
    assert(tokens[1] ==
`<?D
import std.stdio;
/?>`);
    assert(tokens[4] ==
`<?D
output("Hello World");
/?>`);

    /* Test 3 */
    writeln("starting test 3");
    html =
`<!DOCTYPE html>
<html>
{{ variable }}
</html>`;

    tokens = tokenizeDHTML(html).removeJunk;
    writeln(tokens);
    assert(tokens[0] == "<!DOCTYPE html>");
    assert(tokens[1] == "<html>");
    assert(tokens[2] == "{{ variable }}");
    assert(tokens[3] == "</html>");

    /* Test 4: attribute parsing */
    writeln("starting test 4");
    Node[] nodes = parseDHTML(["<input disabled required>"]);
    assert(nodes.length == 1);
    assert(nodes[0].content == "input");
    assert(nodes[0].attributes.length == 2);
    assert(nodes[0].attributes[0].attribute == "disabled");
    assert(nodes[0].attributes[0].type == AttributeType.Parameter);
    assert(nodes[0].attributes[1].attribute == "required");

    nodes = parseDHTML([`<img hidden class="a b" width=100 height=200>`]);
    assert(nodes[0].content == "img");
    assert(nodes[0].attributes.length == 4);
    assert(nodes[0].attributes[0].attribute == "hidden");
    assert(nodes[0].attributes[1].attribute == "class");
    assert(nodes[0].attributes[1].val == "a b");
    assert(nodes[0].attributes[1].type == AttributeType.String);
    assert(nodes[0].attributes[2].attribute == "width");
    assert(nodes[0].attributes[2].val == "100");
    assert(nodes[0].attributes[3].attribute == "height");
    assert(nodes[0].attributes[3].val == "200");
    assert(nodes[0].attributes[3].type == AttributeType.Number);

    /* Test 5: unterminated tokens must not raise a RangeError */
    writeln("starting test 5");
    nodes = parseDHTML(["<", "<?D", "{{"]);
    assert(nodes.length == 3);
    assert(nodes[0].content == "");
    assert(nodes[1].content == "");
    assert(nodes[2].content == "");
}

/**
 * Parses the next attribute from the front of `rest` and removes it from `rest`.
 *
 * `rest` must be non-empty and must not start with whitespace. At least one character is
 * always consumed, so calling this repeatedly (with whitespace stripped in between) terminates.
 *
 * Recognised forms:
 *  `name`          -> AttributeType.Parameter
 *  `name="value"`  -> AttributeType.String (value may contain whitespace)
 *  `name=value`    -> AttributeType.Number (value runs up to the next whitespace)
 */
private Attribute parseAttribute(ref string rest)
{
    Attribute attr;

    // The name ends at the first whitespace or '=', whichever comes first.
    const ptrdiff_t nameEnd = rest.indexOfAny(tagWhitespace ~ "=");
    if (nameEnd == -1)
    {
        // Bare attribute that runs to the end of the tag.
        attr.attribute = rest;
        attr.type = AttributeType.Parameter;
        rest = "";
        return attr;
    }

    attr.attribute = rest[0 .. nameEnd];
    if (rest[nameEnd] != '=')
    {
        // Bare attribute followed by more attributes.
        attr.type = AttributeType.Parameter;
        rest = rest[nameEnd .. $];
        return attr;
    }

    const size_t valueStart = nameEnd + 1;
    if (valueStart < rest.length && rest[valueStart] == '"')
    {
        attr.type = AttributeType.String;
        const ptrdiff_t closingQuote = rest.indexOf("\"", valueStart + 1);
        if (closingQuote == -1)
        {
            // Unterminated string: take everything that is left.
            attr.val = rest[valueStart + 1 .. $];
            rest = "";
        }
        else
        {
            attr.val = rest[valueStart + 1 .. closingQuote];
            rest = rest[closingQuote + 1 .. $];
        }
        return attr;
    }

    attr.type = AttributeType.Number;
    ptrdiff_t valueEnd = rest.indexOfAny(tagWhitespace, valueStart);
    if (valueEnd == -1)
        valueEnd = rest.length;
    attr.val = rest[valueStart .. valueEnd];
    rest = rest[valueEnd .. $];
    return attr;
}

/**
 * Parses tokenized html data.
 *
 * Takes in tokenized html data and outputs a set of nodes that contain information about itself.
 * Empty tokens are skipped; every other token produces exactly one node whose `original` is the
 * token itself.
 *
 * For ordinary tags `content` is the tag name (closing tags keep their leading `/`), and opening
 * tags additionally get their attributes parsed. For `<?D ... /?>` and `{{ ... }}` tokens
 * `content` is the text between the delimiters. For everything else `content` is the token.
 *
 * Params:
 *  tokens = The tokens that are output from tokenizing the html data.
 * Returns: Parsed html nodes.
 */
Node[] parseDHTML(const string[] tokens)
{
    Node[] nodes;
    foreach (tok; tokens)
    {
        if (tok.length == 0)
            continue;

        Node current;
        current.original = tok;

        // Comment delimiters are recognised even when surrounded by whitespace.
        string trimmed = tok.strip;
        if (trimmed.startsWith("<!--"))
        {
            current.nodeType = NodeType.OpeningTag;
            current.tagType = TagType.Comment;
            nodes ~= current;
            continue;
        }
        if (trimmed.startsWith("-->"))
        {
            current.nodeType = NodeType.ClosingTag;
            current.tagType = TagType.Comment;
            nodes ~= current;
            continue;
        }

        // first, determine the NodeType
        if (tok.startsWith("</"))
            current.nodeType = NodeType.ClosingTag;
        else if (tok[0] == '<' || tok.startsWith("{{"))
            current.nodeType = NodeType.OpeningTag;
        else
            current.nodeType = NodeType.Content;

        // second, determine the tag type
        if (current.nodeType == NodeType.Content)
            current.tagType = TagType.None;
        else if (tok.startsWith("<?D"))
            current.tagType = TagType.Code;
        else if (tok.startsWith("{{"))
            current.tagType = TagType.Insert;
        else
            current.tagType = TagType.Tag;

        // next, fill in the content and, for opening tags, the attributes
        if (current.nodeType == NodeType.Content)
        {
            current.content = tok;
        }
        else if (current.tagType == TagType.Tag)
        {
            // Drop the leading '<' and, when present, the trailing '>'.
            // A token cut off by the end of the input has no '>' to drop.
            string inner = tok[1 .. $];
            if (inner.endsWith(">"))
                inner = inner[0 .. $ - 1];

            ptrdiff_t nameEnd = inner.indexOfAny(tagWhitespace);
            if (nameEnd == -1)
                nameEnd = inner.length;
            current.content = inner[0 .. nameEnd];
            current.attributes = cast(Attribute[])[];

            if (current.nodeType == NodeType.OpeningTag)
            {
                string rest = inner[nameEnd .. $].stripLeft;
                while (rest.length > 0)
                {
                    current.attributes ~= parseAttribute(rest);
                    rest = rest.stripLeft;
                }
            }
        }
        else if (current.tagType == TagType.Code)
        {
            // Strip "<?D" and, when the block is terminated, the closing "/?>".
            const bool terminated = tok.length >= 6 && tok.endsWith("/?>");
            current.content = terminated ? tok[3 .. $ - 3] : tok[3 .. $];
        }
        else if (current.tagType == TagType.Insert)
        {
            // Strip "{{" and, when the insert is terminated, the closing "}}".
            const bool terminated = tok.length >= 4 && tok.endsWith("}}");
            current.content = terminated ? tok[2 .. $ - 2] : tok[2 .. $];
        }

        nodes ~= current;
    }
    return nodes;
}