// Code-Only: A super-basic HTML-style parser using BasicLex and a SymbolTable
/*
Copyright (c) 2004 DigiTec Web Consultants, LLC. All rights reserved.
The use of this software is for test and performance purposes only.
You may not use this software in any commercial applications without
the express permission of the copyright holder. You may add to or
modify the code contained here-in to cause it to run slower without
contacting the copyright holder, however, any attempts to make this
code run faster should be documented on:
http://weblogs.asp.net/justin_rogers/articles/CodeOnly_HTMLParser1.aspx
I reserve the right to change or modify the publicly available version
of this code at any time. I will not provide version protection, so
if you have reliance on a particular build of this software, then make
your own back-ups.
You must laugh, at least a little, when reading this licensing agreement,
unless of course you don't have a sense of humor. In all seriousness,
excluding the laughter, laughter in itself does not void this license
agreement, nor compromise it's ability to legally bind you.
You must not remove this notice, or any other, from this software.
*/
using System;
using System.Collections;
using System.Xml;
/// <summary>
/// Classification assigned to each lexer token by the token stream.
/// Values below <see cref="Body"/> are structural tokens; <see cref="Body"/>
/// and everything after it are element-name keywords. The parser relies on
/// that ordering (it tests <c>&gt;= TokenType.Body</c>), so new keywords
/// must be added after <see cref="Body"/>.
/// </summary>
public enum TokenType
{
    /// <summary>Default value; not assigned by the punctuation mapping.</summary>
    Unknown = 0,

    /// <summary>Any identifier that is not a registered keyword.</summary>
    Id = 1,

    /// <summary>The "&lt;" character.</summary>
    LeftAngleBracket = 2,

    /// <summary>The "&gt;" character.</summary>
    RightAngleBracket = 3,

    /// <summary>The "/" character.</summary>
    RightLeaningToothpick = 4,

    /// <summary>The "=" character.</summary>
    Assignment = 5,

    // NOTE: the value 6 is intentionally left unused here; the numbering
    // resumes at 7 exactly as in the original declaration.

    /// <summary>The double-quote character.</summary>
    StringDelimiter1 = 7,

    /// <summary>The single-quote character.</summary>
    StringDelimiter2 = 8,

    /// <summary>A single space character.</summary>
    Whitespace = 9,

    /// <summary>The "body" keyword. First of the keyword range.</summary>
    Body = 20,

    /// <summary>The "p" keyword.</summary>
    Paragraph = 21,

    /// <summary>The "span" keyword.</summary>
    Span = 22,

    /// <summary>The "div" keyword.</summary>
    Div = 23
}
/// <summary>
/// Pairs a raw lexer token with the <see cref="TokenType"/> it was classified as.
/// The token's text is copied into <see cref="TokenData"/> at construction time
/// so callers can read it without going through <see cref="Token"/>.
/// </summary>
public class TypedToken
{
    /// <summary>The raw token this instance wraps.</summary>
    public Token Token;

    /// <summary>The classification assigned to the token.</summary>
    public TokenType TokenType;

    /// <summary>Snapshot of <c>token.TokenData</c> taken by the constructor.</summary>
    public string TokenData;

    /// <summary>Wraps <paramref name="token"/> and tags it with <paramref name="tokenType"/>.</summary>
    /// <param name="token">The raw token; its <c>TokenData</c> is read immediately.</param>
    /// <param name="tokenType">The classification to record for the token.</param>
    public TypedToken(Token token, TokenType tokenType)
    {
        this.Token = token;
        this.TokenData = token.TokenData;
        this.TokenType = tokenType;
    }
}
/// <summary>
/// States of the HTML parser's state machine. Each transition method of the
/// parser returns the state to use for the next token.
/// </summary>
public enum ParseState
{
    /// <summary>A transition could not make sense of the input.</summary>
    Error = 0,

    /// <summary>Between tags: text content, or the "&lt;" that starts a tag.</summary>
    Open = 1,

    /// <summary>Just after "&lt;": expecting an element name or "/".</summary>
    ParseElementStart = 2,

    /// <summary>Inside a start tag: expecting attributes, "&gt;" or "/&gt;".</summary>
    ParseElementAttributes = 3,

    /// <summary>Declared but not dispatched by the parser's main loop.</summary>
    ParseElementEnd = 4,

    /// <summary>Declared but not dispatched by the parser's main loop.</summary>
    ParseElementAttribute = 5,

    /// <summary>Just after "=": expecting an attribute value.</summary>
    ParseAttributeAssignment = 6,

    /// <summary>Just after "&lt;/": expecting the name of the element being closed.</summary>
    ParseNestEndElement = 7
}
/// <summary>
/// A small state-machine parser for HTML-like markup. It walks a
/// <see cref="TokenStream"/> and builds an <see cref="XmlDocument"/> whose
/// root element is <c>HTML</c>.
/// </summary>
/// <remarks>
/// The driver loop in <see cref="CompileToXml"/> advances the stream by one
/// token before every transition. A transition must therefore return with the
/// stream positioned on the LAST token it has consumed, never on a token that
/// still needs to be processed.
/// </remarks>
public class HTMLParser {
    // Message used whenever the state machine ends up in a state it cannot continue from.
    private const string StreamErrorMessage = "There was a stream error";

    private TokenStream tokenStream;
    private XmlDocument currentDocument = null;
    private XmlNode currentNode = null;           // element whose start tag is being parsed
    private XmlAttribute currentAttribute = null; // attribute being built, not yet attached
    private Stack contextStack = null;            // open elements; bottom entry is the HTML root
    private ParseState parseState;
    private SymbolTable symbols;

    /// <summary>
    /// Creates a parser over a private copy of <paramref name="tokens"/> and
    /// registers the element keywords (body, p, span, div).
    /// </summary>
    /// <param name="tokens">Tokens to parse; the array is cloned.</param>
    /// <exception cref="ArgumentNullException"><paramref name="tokens"/> is null.</exception>
    public HTMLParser(Token[] tokens) {
        if ( tokens == null ) {
            throw new ArgumentNullException("tokens");
        }
        this.symbols = new SymbolTable(TokenType.Id, false);
        this.symbols.AddKeyword("body", TokenType.Body);
        this.symbols.AddKeyword("p", TokenType.Paragraph);
        this.symbols.AddKeyword("span", TokenType.Span);
        this.symbols.AddKeyword("div", TokenType.Div);
        this.tokenStream = new TokenStream((Token[]) tokens.Clone(), this.symbols);
    }

    /// <summary>
    /// Parses the token stream from the beginning and returns the resulting document.
    /// The current state is written to the console before every transition.
    /// </summary>
    /// <returns>A new document with an <c>HTML</c> root holding the parsed content.</returns>
    /// <exception cref="Exception">
    /// The state machine reached <see cref="ParseState.Error"/> or a state that
    /// has no transition.
    /// </exception>
    public XmlDocument CompileToXml() {
        this.tokenStream.Reset();
        this.parseState = ParseState.Open;
        // Drop anything left over from a previous (possibly failed) run.
        this.currentNode = null;
        this.currentAttribute = null;
        this.currentDocument = new XmlDocument();
        this.currentDocument.LoadXml("<HTML xml:space=\"preserve\"></HTML>");
        this.contextStack = new Stack();
        this.contextStack.Push(this.currentDocument.DocumentElement);

        while ( this.tokenStream.MoveNext() ) {
            Console.WriteLine("Parsing State: {0}", this.parseState);
            switch ( this.parseState ) {
                case ParseState.Open:
                    this.parseState = TransitionOpen();
                    break;
                case ParseState.ParseElementAttributes:
                    this.parseState = TransitionParseElementAttributes();
                    break;
                case ParseState.ParseNestEndElement:
                    this.parseState = TransitionParseNestEndElement();
                    break;
                case ParseState.ParseElementStart:
                    this.parseState = TransitionParseElementStart();
                    break;
                case ParseState.ParseAttributeAssignment:
                    this.parseState = TransitionParseAttributeAssignment();
                    break;
                default:
                    throw new Exception(StreamErrorMessage);
            }
        }

        // An error raised while handling the final token(s) never reaches the
        // switch above because the loop has already ended; report it here so it
        // is not silently swallowed.
        if ( this.parseState == ParseState.Error ) {
            throw new Exception(StreamErrorMessage);
        }
        return this.currentDocument;
    }

    /// <summary>
    /// Handles the token(s) after "=": either a quoted value (single or double
    /// quotes, everything up to the matching delimiter) or a single bare
    /// identifier/keyword token.
    /// </summary>
    /// <returns>
    /// <see cref="ParseState.ParseElementAttributes"/> once the attribute has
    /// been attached; <see cref="ParseState.Error"/> if the value is missing,
    /// malformed, or the closing delimiter is never found.
    /// </returns>
    public ParseState TransitionParseAttributeAssignment() {
        this.tokenStream.ScanPast(TokenType.Whitespace);
        if ( this.tokenStream.EndOfStream() ) {
            // Input ended right after "=" (Current would be null here).
            return ParseState.Error;
        }

        TokenType startType = this.tokenStream.Current.TokenType;
        if ( startType == TokenType.StringDelimiter1 || startType == TokenType.StringDelimiter2 ) {
            // Everything up to the matching delimiter is taken verbatim, which
            // allows embedded HTML constructs inside the value.
            while ( this.tokenStream.MoveNext() && this.tokenStream.Current.TokenType != startType ) {
                this.currentAttribute.Value += this.tokenStream.Current.TokenData;
            }
            if ( this.tokenStream.EndOfStream() ) {
                // Unterminated quoted value.
                return ParseState.Error;
            }
            CommitCurrentAttribute();
            return ParseState.ParseElementAttributes;
        }

        if ( IsNameToken(startType) ) {
            this.currentAttribute.Value += this.tokenStream.Current.TokenData;
            CommitCurrentAttribute();
            return ParseState.ParseElementAttributes;
        }
        return ParseState.Error;
    }

    /// <summary>
    /// Handles the inside of a start tag: attribute names, the closing "&gt;",
    /// or the self-closing "/&gt;".
    /// </summary>
    /// <returns>
    /// <see cref="ParseState.ParseAttributeAssignment"/> when an attribute name
    /// is followed by "="; <see cref="ParseState.Open"/> when the tag ends;
    /// <see cref="ParseState.Error"/> on unexpected input or end of stream.
    /// </returns>
    public ParseState TransitionParseElementAttributes() {
        while ( true ) {
            this.tokenStream.ScanPast(TokenType.Whitespace);
            if ( this.tokenStream.EndOfStream() ) {
                return ParseState.Error;
            }

            TokenType tokenType = this.tokenStream.Current.TokenType;

            // It is important that Body be the first keyword and anything after it be a keyword.
            // We can have attributes that have the same name as a tag that we are parsing.
            if ( IsNameToken(tokenType) ) {
                this.currentAttribute = this.currentDocument.CreateAttribute(this.tokenStream.Current.TokenData);
                if ( !this.tokenStream.MoveNext() ) {
                    return ParseState.Error;
                }
                this.tokenStream.ScanPast(TokenType.Whitespace);
                if ( this.tokenStream.EndOfStream() ) {
                    return ParseState.Error;
                }
                if ( this.tokenStream.Current.TokenType == TokenType.Assignment ) {
                    return ParseState.ParseAttributeAssignment;
                }

                // Value-less attribute: its value defaults to its own name.
                this.currentAttribute.Value = this.currentAttribute.LocalName;
                CommitCurrentAttribute();
                // The token we looked at has NOT been consumed yet. Returning
                // now would let the driver's MoveNext() skip it, so loop and
                // handle it right here instead.
                continue;
            }

            if ( tokenType == TokenType.RightAngleBracket ) {
                // We are ending the start tag and entering a nested context.
                this.contextStack.Push(this.currentNode);
                this.currentNode = null;
                return ParseState.Open;
            }

            if ( tokenType == TokenType.RightLeaningToothpick ) {
                // Self-closing element: it ends without being pushed onto the stack.
                if ( this.tokenStream.MoveNext() && this.tokenStream.Current.TokenType == TokenType.RightAngleBracket ) {
                    return ParseState.Open;
                }
            }
            return ParseState.Error;
        }
    }

    /// <summary>
    /// Handles the element name of an end tag. The name must equal the local
    /// name of the innermost open element and be followed directly by "&gt;".
    /// </summary>
    /// <returns>
    /// <see cref="ParseState.Open"/> after popping the closed element;
    /// otherwise <see cref="ParseState.Error"/>.
    /// </returns>
    public ParseState TransitionParseNestEndElement() {
        // Count == 1 means only the HTML root is open; it cannot be closed by input.
        if ( this.contextStack.Count <= 1 || this.tokenStream.EndOfStream() ) {
            return ParseState.Error;
        }

        string openName = ((XmlNode) this.contextStack.Peek()).LocalName;
        if ( this.tokenStream.Current.TokenData != openName ) {
            return ParseState.Error;
        }
        if ( this.tokenStream.MoveNext() && this.tokenStream.Current.TokenType == TokenType.RightAngleBracket ) {
            this.contextStack.Pop();
            return ParseState.Open;
        }
        return ParseState.Error;
    }

    /// <summary>
    /// Handles the token directly after "&lt;": an element name starts a new
    /// element, "/" starts an end tag.
    /// </summary>
    /// <returns>The state for the rest of the tag, or <see cref="ParseState.Error"/>.</returns>
    public ParseState TransitionParseElementStart() {
        if ( this.tokenStream.EndOfStream() ) {
            return ParseState.Error;
        }
        switch ( this.tokenStream.Current.TokenType ) {
            case TokenType.Div:
            case TokenType.Paragraph:
            case TokenType.Body:
            case TokenType.Span:
            case TokenType.Id:
                AllocateNode();
                return ParseState.ParseElementAttributes;
            case TokenType.RightLeaningToothpick:
                return ParseState.ParseNestEndElement;
            default:
                return ParseState.Error;
        }
    }

    /// <summary>
    /// Handles content between tags: "&lt;" starts a tag, any other token is
    /// appended as a text node to the innermost open element.
    /// </summary>
    /// <returns>
    /// <see cref="ParseState.ParseElementStart"/> on "&lt;", otherwise
    /// <see cref="ParseState.Open"/>.
    /// </returns>
    public ParseState TransitionOpen() {
        // Directly under the document element, skip whitespace up to the first real token.
        if ( this.contextStack.Count == 1 ) {
            this.tokenStream.ScanPast(TokenType.Whitespace);
        }
        if ( this.tokenStream.EndOfStream() ) {
            return ParseState.Open;
        }
        if ( this.tokenStream.Current.TokenType == TokenType.LeftAngleBracket ) {
            return ParseState.ParseElementStart;
        }
        XmlNode parent = (XmlNode) this.contextStack.Peek();
        parent.AppendChild(this.currentDocument.CreateTextNode(this.tokenStream.Current.TokenData));
        return ParseState.Open;
    }

    // Creates an element named after the current token and appends it to the innermost open element.
    private void AllocateNode() {
        this.currentNode = this.currentDocument.CreateElement(this.tokenStream.Current.TokenData);
        ((XmlNode) this.contextStack.Peek()).AppendChild(this.currentNode);
    }

    // Attaches the attribute under construction to the current element and clears it.
    private void CommitCurrentAttribute() {
        this.currentNode.Attributes.SetNamedItem(this.currentAttribute);
        this.currentAttribute = null;
    }

    // True for tokens usable as a name: plain identifiers and every keyword (Body and above).
    private static bool IsNameToken(TokenType tokenType) {
        return tokenType == TokenType.Id || tokenType >= TokenType.Body;
    }
}
/// <summary>
/// Forward-only cursor over an array of lexer tokens. Tokens are classified
/// lazily, the first time <see cref="Current"/> is read at a given position.
/// </summary>
public class TokenStream {
    private Token[] tokens;
    private int streamOffset;   // -1 until the first MoveNext(); tokens.Length once exhausted
    private TypedToken current; // cached classification of the token at streamOffset
    private SymbolTable syms;

    /// <summary>Creates a stream positioned before the first token.</summary>
    /// <param name="tokens">The tokens to iterate; the array is used as-is (not copied).</param>
    /// <param name="syms">Symbol table used to classify non-punctuation tokens.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public TokenStream(Token[] tokens, SymbolTable syms) {
        if ( tokens == null ) {
            throw new ArgumentNullException("tokens");
        }
        if ( syms == null ) {
            throw new ArgumentNullException("syms");
        }
        this.tokens = tokens;
        this.streamOffset = -1;
        this.syms = syms;
    }

    /// <summary>
    /// Rewinds the stream to before the first token and calls
    /// <c>ClearIdentifiers()</c> on the symbol table.
    /// </summary>
    public void Reset() {
        this.streamOffset = -1;
        this.current = null; // do not keep a stale token from the previous pass
        this.syms.ClearIdentifiers();
    }

    /// <summary>Advances to the next token, if there is one.</summary>
    /// <returns>True when the stream is positioned on a token; false once it is exhausted.</returns>
    public bool MoveNext() {
        if ( !EndOfStream() ) {
            this.streamOffset++;
            this.current = null;
        }
        return !EndOfStream();
    }

    /// <summary>
    /// The classified token at the current position, or null once the stream
    /// has been exhausted.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <see cref="MoveNext"/> has not been called since construction or <see cref="Reset"/>.
    /// </exception>
    public TypedToken Current {
        get {
            if ( this.streamOffset == -1 ) {
                throw new InvalidOperationException("You must call MoveNext() before using the stream");
            }
            if ( this.current == null && !EndOfStream() ) {
                this.current = ConvertTokenToType(this.tokens[this.streamOffset]);
            }
            return this.current;
        }
    }

    /// <summary>Skips consecutive tokens of the given type, starting at the current token.</summary>
    /// <param name="tokenType">The token type to skip.</param>
    public void ScanPast(TokenType tokenType) {
        ScanPast(new TokenType[] { tokenType });
    }

    /// <summary>
    /// Skips consecutive tokens whose type is one of <paramref name="tokenTypes"/>,
    /// starting at the current token. On return the stream is either exhausted
    /// or positioned on the first token that does not match.
    /// </summary>
    /// <param name="tokenTypes">The token types to skip; an empty array skips nothing.</param>
    /// <exception cref="ArgumentNullException"><paramref name="tokenTypes"/> is null.</exception>
    public void ScanPast(TokenType[] tokenTypes) {
        if ( tokenTypes == null ) {
            throw new ArgumentNullException("tokenTypes");
        }
        if ( tokenTypes.Length == 0 ) {
            // Nothing can match. Without this guard the loop below would have
            // no way to either advance or stop.
            return;
        }
        while ( !EndOfStream() && Array.IndexOf(tokenTypes, Current.TokenType) >= 0 ) {
            MoveNext();
        }
    }

    /// <summary>Whether the stream has moved past its last token.</summary>
    /// <returns>True once every token has been consumed; false before the first MoveNext().</returns>
    public bool EndOfStream() {
        return this.streamOffset >= this.tokens.Length;
    }

    // Classifies a raw token: fixed punctuation is mapped here, everything else
    // is delegated to the symbol table's AddString().
    private TypedToken ConvertTokenToType(Token token) {
        TokenType tokenType;
        switch ( token.TokenData ) {
            case ">":
                tokenType = TokenType.RightAngleBracket;
                break;
            case "<":
                tokenType = TokenType.LeftAngleBracket;
                break;
            case "=":
                tokenType = TokenType.Assignment;
                break;
            case "/":
                tokenType = TokenType.RightLeaningToothpick;
                break;
            case "\"":
                tokenType = TokenType.StringDelimiter1;
                break;
            case "'":
                tokenType = TokenType.StringDelimiter2;
                break;
            case " ":
                // NOTE(review): only a single space counts as whitespace; tabs and
                // newlines fall through to the symbol table. Confirm this matches
                // what the lexer emits.
                tokenType = TokenType.Whitespace;
                break;
            default:
                tokenType = this.syms.AddString(token.TokenData);
                break;
        }
        return new TypedToken(token, tokenType);
    }
}