A Simple HTML Parser in JavaScript

The JavaScript code below, when run, prompts the user to enter some HTML text. When the “Parse” button is clicked, that text will be parsed into an instance of a class named “HTMLDocument”, which contains within it objects corresponding to the elements and attributes of the HTML document. This object will then be converted back to text and displayed in order to verify that the HTML was parsed correctly.

To see the code in action, copy it into an .html file and open that file in a web browser that runs JavaScript. Click the “Load Demo Text” button to populate the text field with a simple HTML document for testing.

Obviously, JavaScript already has built-in functions that can do the same thing much better and more efficiently; at least one such method was discussed in a previous post. This program, however, was written as a starting point for creating parsers for other formats, or in other programming languages.


<html>
<body>

	<!-- ui -->
	<!-- Input area: the textarea whose value the script reads as the text to parse. -->
	<!-- NOTE(review): the label below reads "Layout:" although it captions the input box; confirm the intended wording. -->
	<div>
		<div><label>Layout:</label></div>
		<div><textarea id="inputStringToParse" cols="80" rows="20"></textarea></div>
	</div>
	<!-- Action buttons: each one calls the matching *_Clicked handler defined in the script below. -->
	<div>
		<button id="buttonLoadDemoText" onclick="buttonLoadDemoText_Clicked();">Load Demo Text</button>
		<button id="buttonParse" onclick="buttonParse_Clicked();">Parse</button>
	</div>
	<!-- Output area: the Parse handler writes the re-serialized document into divOutput. -->
	<div>
		<label>Output:</label>
		<div id="divOutput"></div>
	</div>
	
<script type="text/javascript">

// ui event handlers

function buttonLoadDemoText_Clicked()
{
	// Click handler for the "Load Demo Text" button: fills the input
	// textarea with a small sample document.
	//
	// The sample is assembled from fragments because this code lives
	// inside the page itself: a complete tag written out in one literal
	// would be interpreted as a real tag of the page. Every "<" is
	// therefore kept apart from the tag name that follows it.

	var lt = "<";
	var demoTextPieces =
	[
		lt, "html>", lt, "body>",
		lt, "script type='text/javascript'>alert('hello');", lt, "/script>",
		lt, "/body>", lt, "/html>"
	];

	document.getElementById("inputStringToParse").value = demoTextPieces.join("");
}
	
function buttonParse_Clicked()
{
	// Click handler for the "Parse" button: parses the text in the input
	// textarea, converts the resulting document object back to a string,
	// and shows that string in the output div.

	var markupToParse = document.getElementById("inputStringToParse").value;

	var parsedDocument = HTMLParser.stringToHTMLDocument(markupToParse);
	var parsedDocumentAsText = parsedDocument.toString();

	// The serialized text is assigned as markup, so the string returned
	// by toString() is what the browser interprets here.
	document.getElementById("divOutput").innerHTML = parsedDocumentAsText;
}

// extensions

function StringExtensions()
{
	// Never instantiated; serves only as a label for the
	// String.prototype additions in the block that follows.
}
{
	// Splits this string on the given delimiter and returns the pieces,
	// leaving out every piece that is the empty string (as produced by
	// leading, trailing, or consecutive delimiters).
	String.prototype.splitAndIgnoreEmptyStrings = function(delimiter)
	{
		var pieces = this.split(delimiter);
		var nonEmptyPieces = [];

		for (var i = 0; i < pieces.length; i++)
		{
			var piece = pieces[i];
			if (piece !== "")
			{
				nonEmptyPieces.push(piece);
			}
		}

		return nonEmptyPieces;
	};
}

// classes

/**
 * A single attribute of a parsed element.
 *
 * @param name  The attribute name, stored as given.
 * @param value The attribute value, stored as given (the parser in this
 *              file passes the raw text after "=", quotes included), or
 *              undefined/null for an attribute written without a value.
 */
function HTMLAttribute(name, value)
{
	this.name = name;
	this.value = value;
}
{
	/**
	 * Serializes the attribute as "name=value".
	 *
	 * An attribute with no value (undefined or null) is written as just
	 * its name, instead of the misleading text "name=undefined".
	 *
	 * @returns The attribute as a string.
	 */
	HTMLAttribute.prototype.toString = function()
	{
		if (this.value == null)
		{
			return this.name;
		}

		var returnValue = this.name + "=" + this.value;
		return returnValue;
	};
}

/**
 * A parsed document, represented by its root element.
 *
 * @param elementRoot The root element of the document. May be
 *                    undefined/null when nothing was parsed (for example,
 *                    when the parsed string was empty).
 */
function HTMLDocument(elementRoot)
{
	this.elementRoot = elementRoot;
}
{
	/**
	 * Serializes the document by serializing its root element.
	 *
	 * @returns The root element's string form, or "" when the document
	 *          has no root element (previously this case threw).
	 */
	HTMLDocument.prototype.toString = function()
	{
		if (this.elementRoot == null)
		{
			return "";
		}

		var returnValue = this.elementRoot.toString();
		return returnValue;
	};
}

/**
 * One element of a parsed document.
 *
 * Constructing an element with a non-null parent also appends the new
 * element to that parent's children array.
 *
 * @param parent The enclosing element, or null for an element with no parent.
 */
function HTMLElement(parent)
{
	this.parent = parent;
	this.tagName = null;
	this.attributes = [];
	this.content = null;
	this.children = [];

	if (this.parent != null)
	{
		this.parent.children.push(this);
	}
}
{
	/**
	 * Serializes this element and everything inside it, with every "<"
	 * and ">" replaced by "&lt;" and "&gt;" so the result can be shown as
	 * text when assigned to innerHTML.
	 *
	 * Compared with the earlier version:
	 * - attributes are separated by a single space (they used to be run
	 *   together), and no stray space follows a tag name without attributes;
	 * - an element that has both text content and children writes the
	 *   content followed by the children (the children used to be dropped).
	 *
	 * @returns The escaped string form of the element.
	 */
	HTMLElement.prototype.toString = function()
	{
		var returnValue = "<" + this.tagName;

		for (var a = 0; a < this.attributes.length; a++)
		{
			var attribute = this.attributes[a];
			returnValue += " " + attribute.toString();
		}

		var hasContent = (this.content != null);

		if (this.children.length == 0 && hasContent == false)
		{
			// Nothing inside: write the element in self-closing form.
			returnValue += "/>";
		}
		else
		{
			returnValue += ">";

			if (hasContent)
			{
				returnValue += this.content;
			}

			for (var c = 0; c < this.children.length; c++)
			{
				var child = this.children[c];
				returnValue += child.toString();
			}

			returnValue += "</" + this.tagName + ">";
		}

		// The child strings appended above are already escaped. Escaping
		// again here is harmless because only "<" and ">" are replaced and
		// neither occurs in "&lt;" or "&gt;".
		returnValue = returnValue.split("<").join("&lt;");
		returnValue = returnValue.split(">").join("&gt;");

		return returnValue;
	};
}

function HTMLParser()
{
	// static class
}
{
	/**
	 * Parses a string of markup into an HTMLDocument.
	 *
	 * The string is cut at every "<"; each piece is either a close tag
	 * (it starts with "/") or an open tag optionally followed by text.
	 *
	 * Fixes relative to the earlier version:
	 * - a self-closing tag (open tag ending in "/") no longer throws; the
	 *   "/" is removed before the tag name and attributes are read, they
	 *   are stored on the new element, and parsing continues in its parent;
	 * - a close tag with no open element is ignored instead of throwing;
	 * - a piece with no tag text at all (from "<>") is skipped;
	 * - an attribute is split at its first "=" only, so a value that
	 *   itself contains "=" is kept whole.
	 *
	 * Known limitations, unchanged: text following a close tag or a
	 * self-closing tag is discarded; attributes are separated purely on
	 * spaces, so quoted values containing spaces are broken apart; if
	 * several parentless elements occur, the last one becomes the root.
	 *
	 * @param stringToParse The markup text to parse.
	 * @returns An HTMLDocument whose elementRoot is the parsed root
	 *          element (undefined when no element was found).
	 */
	HTMLParser.stringToHTMLDocument = function(stringToParse)
	{
		var tagsAndContentsAsStrings = stringToParse.splitAndIgnoreEmptyStrings("<");

		var elementCurrent = null;
		var elementRoot;

		for (var t = 0; t < tagsAndContentsAsStrings.length; t++)
		{
			var tagAndContentAsString = tagsAndContentsAsStrings[t];

			if (tagAndContentAsString.startsWith("/") == true)
			{
				// close tag: step back out to the enclosing element,
				// unless there is no open element to close.
				if (elementCurrent != null)
				{
					elementCurrent = elementCurrent.parent;
				}
				continue;
			}

			var tagAndContent = tagAndContentAsString.splitAndIgnoreEmptyStrings(">");

			if (tagAndContent.length == 0)
			{
				// Nothing between "<" and ">": there is no tag to create.
				continue;
			}

			var element = new HTMLElement(elementCurrent);

			if (element.parent == null)
			{
				elementRoot = element;
			}

			var openTagAsString = tagAndContent[0];
			var isSelfClosing = openTagAsString.endsWith("/");

			if (isSelfClosing == true)
			{
				// Drop the trailing "/" so it does not end up in the tag
				// name or in the last attribute.
				openTagAsString = openTagAsString.substring(0, openTagAsString.length - 1);
			}
			else if (tagAndContent.length > 1)
			{
				element.content = tagAndContent[1];
			}

			var tagNameAndAttributesAsStrings = openTagAsString.splitAndIgnoreEmptyStrings(" ");

			element.tagName = tagNameAndAttributesAsStrings[0];

			for (var a = 1; a < tagNameAndAttributesAsStrings.length; a++)
			{
				var attributeAsString = tagNameAndAttributesAsStrings[a];
				var separatorIndex = attributeAsString.indexOf("=");
				var attributeName;
				var attributeValue;

				if (separatorIndex < 0)
				{
					// Attribute written without a value.
					attributeName = attributeAsString;
					attributeValue = undefined;
				}
				else
				{
					attributeName = attributeAsString.substring(0, separatorIndex);
					attributeValue = attributeAsString.substring(separatorIndex + 1);
				}

				var attribute = new HTMLAttribute
				(
					attributeName, attributeValue
				);
				element.attributes.push(attribute);
			}

			// A self-closing element cannot contain anything, so later
			// pieces still belong to the element that was current before it.
			if (isSelfClosing == false)
			{
				elementCurrent = element;
			}
		}

		var returnValue = new HTMLDocument(elementRoot);

		return returnValue;
	};
}

</script>
</body>
</html>

Advertisements
This entry was posted in Uncategorized and tagged , , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s