Creating *Valid* XHTML Documents

While working on our CMS, we ran into a common problem that no one seems to have solved completely yet: creating valid XHTML documents from IE's non-validated HTML editing component. The common suggestion with .NET is to use the SgmlReader to make a pass over your document and then output it directly to an XmlWriter using WriteNode. While that may seem like a good suggestion, you don't really gain much with this approach, because what you are generating is definitely not XHTML; it is HTML with closing tags. The following block of code shows how to extend this method to create truly valid XHTML. For the sake of simplicity, there are a couple of missing pieces in this code (like handling element nesting appropriately and running a validation against the schema after conversion to make sure you were able to convert the document), but this should get you started in the right direction.

using System;
using Sgml;
using System.IO;
using System.Xml.Schema;
using System.Xml;
using System.Text;
using System.Web;
using System.Collections;
using System.Collections.Specialized;
namespace Activehead.Empower.Conversion
{
 /// <summary>
 /// Converts loosely-formed HTML into a well-formed XML document and then
 /// strips every element and attribute that is not declared in an XHTML
 /// schema. Unknown elements are unwrapped (their content is kept in place);
 /// unknown attributes are dropped.
 /// </summary>
 /// <remarks>
 /// This class does not fix element nesting and does not run a final schema
 /// validation pass over the result.
 /// </remarks>
 public class HtmlConverter
 {
  /// <summary>
  /// Creates a converter that loads the schema from "xhtml.xsd", resolved
  /// against the current working directory.
  /// </summary>
  public HtmlConverter() : this("xhtml.xsd")
  {
  }

  /// <summary>
  /// Creates a converter that loads and compiles the schema at
  /// <paramref name="schemaPath"/>.
  /// </summary>
  /// <param name="schemaPath">Path of the XHTML schema (.xsd) file.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="schemaPath"/> is null.
  /// </exception>
  public HtmlConverter(string schemaPath)
  {
   if(schemaPath == null)
   {
    throw new ArgumentNullException("schemaPath");
   }

   // The using block closes the stream; no explicit Close() is needed.
   using(Stream schemaStream = File.OpenRead(schemaPath))
   {
    schema = XmlSchema.Read(schemaStream, null);
   }

   // Compilation populates the Elements table and the AttributeUses
   // collections that the lookups below rely on.
   schema.Compile(null);
  }

  XmlSchema schema;
  string xhtmlNamespace = "http://www.w3.org/1999/xhtml";

  /// <summary>
  /// Lower-cases a name without consulting the current culture, so that the
  /// lookup does not depend on the machine's locale (e.g. Turkish 'I').
  /// </summary>
  static string NormalizeName(string name)
  {
   return name.ToLower(System.Globalization.CultureInfo.InvariantCulture);
  }

  /// <summary>
  /// Returns the global schema declaration whose name is the lower-cased
  /// <paramref name="localName"/> in the XHTML namespace, or null if the
  /// schema declares no such element.
  /// </summary>
  XmlSchemaElement FindSchemaElement(string localName)
  {
   XmlQualifiedName key = new XmlQualifiedName(NormalizeName(localName), xhtmlNamespace);
   return schema.Elements[key] as XmlSchemaElement;
  }

  /// <summary>
  /// Tells whether the schema declares an element with the same
  /// (case-insensitive) local name as <paramref name="node"/>.
  /// </summary>
  bool IsValidElement(XmlElement node)
  {
   return FindSchemaElement(node.LocalName) != null;
  }

  /// <summary>
  /// Tells whether the schema type of <paramref name="parent"/> declares an
  /// attribute with the same (case-insensitive) local name as
  /// <paramref name="attribute"/>. Returns false when the element itself is
  /// not declared or has no complex type.
  /// </summary>
  bool IsValidAttribute(XmlElement parent, XmlAttribute attribute)
  {
   XmlSchemaElement e = FindSchemaElement(parent.LocalName);
   if(e == null)
   {
    return false;
   }

   // NOTE(review): SchemaType is the type declared inline on the element;
   // if the schema uses named types, the compiled type property may be
   // needed here instead - confirm against the xhtml.xsd in use.
   XmlSchemaComplexType ct = e.SchemaType as XmlSchemaComplexType;
   if(ct == null)
   {
    return false;
   }

   // Compare against the normalized local name, not the raw Name, which
   // keeps the original casing and any prefix.
   string attributeName = NormalizeName(attribute.LocalName);
   foreach(XmlSchemaAttribute a in ct.AttributeUses.Values)
   {
    if(a.QualifiedName.Name == attributeName)
    {
     return true;
    }
   }

   return false;
  }

  /// <summary>
  /// Runs <paramref name="strInputHtml"/> through SgmlReader (DocType
  /// "HTML") and returns the markup written back out by an XmlTextWriter.
  /// </summary>
  string ProcessString(string strInputHtml)
  {
   using(StringReader sr = new StringReader(strInputHtml))
   using(StringWriter sw = new StringWriter())
   {
    SgmlReader reader = new SgmlReader();
    XmlTextWriter w = new XmlTextWriter(sw);
    try
    {
     reader.DocType = "HTML";
     reader.InputStream = sr;

     reader.Read();
     while(!reader.EOF)
     {
      w.WriteNode(reader, true);
     }
     w.Flush();
    }
    finally
    {
     // Close explicitly so both are released even when parsing throws.
     w.Close();
     reader.Close();
    }

    return sw.ToString();
   }
  }

  /// <summary>
  /// Removes from <paramref name="element"/> every attribute that
  /// IsValidAttribute rejects.
  /// </summary>
  void RemoveInvalidAttributes(XmlElement element)
  {
   // Walk backwards so removals do not shift the indexes still to visit.
   for(int n = element.Attributes.Count - 1; n >= 0; n--)
   {
    XmlAttribute attribute = element.Attributes[n];
    if(!IsValidAttribute(element, attribute))
    {
     element.Attributes.Remove(attribute);
    }
   }
  }

  /// <summary>
  /// Cleans the descendants of <paramref name="parent"/>: undeclared
  /// elements are replaced by their own content, declared elements get
  /// their undeclared attributes removed and are processed recursively.
  /// Non-element nodes are left untouched.
  /// </summary>
  void ProcessNode(XmlNode parent)
  {
   XmlNode node = parent.FirstChild;
   while(node != null)
   {
    XmlElement element = node as XmlElement;
    if(element == null)
    {
     node = node.NextSibling;
     continue;
    }

    if(!IsValidElement(element))
    {
     // Resume at the first hoisted child (it still has to be checked),
     // or at the next sibling when the element is empty.
     XmlNode resume = element.FirstChild != null ? element.FirstChild : element.NextSibling;

     // Move the content in front of the element so document order is
     // preserved; FirstChild is re-read each time because InsertBefore
     // detaches the node from the element.
     while(element.FirstChild != null)
     {
      parent.InsertBefore(element.FirstChild, element);
     }
     parent.RemoveChild(element);

     node = resume;
     continue;
    }

    RemoveInvalidAttributes(element);
    ProcessNode(element);
    node = element.NextSibling;
   }
  }

  /// <summary>
  /// Converts <paramref name="html"/> into an XmlDocument containing only
  /// elements and attributes declared in the loaded schema.
  /// </summary>
  /// <param name="html">The HTML markup to convert.</param>
  /// <returns>The cleaned document.</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="html"/> is null.
  /// </exception>
  /// <remarks>
  /// The document element itself is never unwrapped; only its attributes
  /// and descendants are cleaned. XmlDocument.LoadXml throws if the
  /// intermediate markup is not a single-rooted document.
  /// </remarks>
  public XmlDocument ToXHtml(string html)
  {
   if(html == null)
   {
    throw new ArgumentNullException("html");
   }

   string xhtml = ProcessString(html);
   XmlDocument doc = new XmlDocument();
   doc.LoadXml(xhtml);

   XmlElement root = doc.DocumentElement;
   RemoveInvalidAttributes(root);
   ProcessNode(root);
   return doc;
  }
 }
}

No Comments