using System;
using System.Collections;
using HtmlAgilityPack;

/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Elements that are not on the element whitelist are replaced by an empty
/// PRE element; attributes that are not on the attribute whitelist are
/// removed; STYLE attribute values are scrubbed of a few blocked keywords.
/// </summary>
/// <remarks>
/// NOTE(review): URL-bearing attributes on the whitelist (HREF, SRC, DYNSRC,
/// LOWSRC, BACKGROUND) are passed through without any scheme check, so values
/// such as "javascript:..." are not filtered by this class. Confirm whether
/// callers rely on an additional layer for that.
/// </remarks>
public sealed class SafeHtml
{
    // Utility class: all members are static, so instantiation is disallowed.
    private SafeHtml() {}

    // Element whitelist. Sorted once in the static constructor so that it can
    // be binary-searched; the literal order below is therefore not significant.
    private static readonly string[] allowedElements = new string[]
    {
        // #DOCUMENT and #TEXT are HtmlAgilityPack names
        "!DOCTYPE", "#DOCUMENT", "#TEXT", "A", "ACRONYM", "ADDRESS", "AREA", "B",
        "BASE", "BASEFONT", "BDO", "BIG", "BLOCKQUOTE", "BODY", "BR", "CAPTION",
        "CENTER", "CITE", "CODE", "COL", "COLGROUP", "DD", "DEL", "DFN", "DIR",
        "DIV", "DL", "EM", "FONT", "HEAD", "H1", "H2", "H3", "H4", "H5", "H6",
        "HR", "HTML", "I", "IMG", "INS", "KBD", "LABEL", "LI", "LISTING", "MAP",
        "MARQUEE", "MENU", "NOBR", "NOSCRIPT", "OL", "P", "PRE", "Q", "RT",
        "RUBY", "S", "SAMP", "SMALL", "SPAN", "STRIKE", "STRONG", "SUB", "SUP",
        "TABLE", "TBODY", "TD", "TFOOT", "TH", "THEAD", "TITLE", "TR", "TT", "U",
        "UL", "VAR", "WBR", "XMP"
    };

    // Attribute whitelist. Sorted once in the static constructor (see above).
    private static readonly string[] allowedAttributes = new string[]
    {
        "ABBR", "ADDITIVE", "ALIGN", "ALLOWTRANSPARENCY", "ALT",
        "ATOMICSELECTION", "AXIS", "BACKGROUND", "BGCOLOR", "BGPROPERTIES",
        "BORDER", "BORDERCOLOR", "CAPTION", "CELLPADDING", "CELLSPACING", "CITE",
        "CLEAR", "COLOR", "COLS", "COLSPAN", "COMPACT", "COORDS", "DIR",
        "DISABLED", "DYNSRC", "FACE", "FGCOLOR", "FONT", "HEADERS", "HEIGHT",
        "HREF", "HREFLANG", "HSPACE", "ID", "LENGTH", "LINK", "LOWSRC", "MEDIA",
        "NAME", "NOHREF", "NOSHADE", "NOWRAP", "REL", "REV", "ROWS", "ROWSPAN",
        "SCROLL", "SHAPE", "SIZE", "SPAN", "SRC", "START", "STYLE", "TARGET",
        "TEXT", "TITLE", "TYPE", "UNSELECTABLE", "URN", "VALIGN", "VALUE",
        "VSPACE", "WIDTH", "WRAP"
    };

    // Substrings stripped from STYLE attribute values, in the order applied.
    private static readonly string[] blockedStyleTokens = new string[]
    {
        "behavior", "position", "top", "left"
    };

    static SafeHtml()
    {
        // Array.BinarySearch only gives correct answers on an array sorted
        // with the same comparer used for the search. The literals above are
        // not in sorted order (e.g. "HEAD" precedes "H1"), so sort explicitly
        // instead of relying on hand-maintained ordering.
        Array.Sort(allowedElements, StringComparer.OrdinalIgnoreCase);
        Array.Sort(allowedAttributes, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Sanitizes an HTML string against the element and attribute whitelists.
    /// </summary>
    /// <param name="html">The HTML markup to sanitize. May be null or empty.</param>
    /// <returns>
    /// The serialized, sanitized markup of the whole parsed document, or an
    /// empty string when <paramref name="html"/> is null or empty.
    /// </returns>
    public static string CleanHtml(string html)
    {
        // Nothing to parse: avoid handing null to the parser and avoid
        // indexing into an empty node list.
        if (html == null || html.Length == 0)
        {
            return string.Empty;
        }

        // Load data
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Sanitize the tree in place, starting from the document root.
        processNode(doc.DocumentNode);

        // Serialize every top-level node. Writing only ChildNodes[0] would
        // drop all siblings after the first and fail when there are none;
        // for single-root input the result is the same.
        return doc.DocumentNode.WriteTo();
    }

    /// <summary>
    /// Recursively sanitizes <paramref name="node"/> and its descendants.
    /// </summary>
    /// <param name="node">The node to sanitize in place.</param>
    private static void processNode(HtmlNode node)
    {
        // Check to see if element is not allowed
        if (!isAllowed(allowedElements, node.Name))
        {
            // Rename node to something simple, remove all children and attributes
            node.Name = "PRE";
            node.RemoveAll();
            return;
        }

        // Collect disallowed attributes first; they are removed after the
        // loop so the attribute collection is not modified while enumerated.
        ArrayList removeList = new ArrayList();
        foreach (HtmlAttribute att in node.Attributes)
        {
            if (!isAllowed(allowedAttributes, att.Name))
            {
                removeList.Add(att);
            }
            else if (string.Equals(att.Name, "STYLE", StringComparison.OrdinalIgnoreCase))
            {
                // Check for style, to disable behaviors
                cleanStyle(att);
            }
        }

        // Remove attributes
        foreach (HtmlAttribute att in removeList)
        {
            node.Attributes.Remove(att);
        }

        // Process child nodes
        foreach (HtmlNode child in node.ChildNodes)
        {
            processNode(child);
        }
    }

    /// <summary>
    /// Tests whether <paramref name="name"/> is on a whitelist, ignoring case.
    /// </summary>
    /// <param name="sortedNames">
    /// A whitelist already sorted with <see cref="StringComparer.OrdinalIgnoreCase"/>.
    /// </param>
    /// <param name="name">The element or attribute name to look up.</param>
    /// <returns>True when the name is on the whitelist.</returns>
    private static bool isAllowed(string[] sortedNames, string name)
    {
        // Ordinal, case-insensitive matching avoids culture-dependent casing
        // (e.g. Turkish dotted/dotless i) changing which names are accepted.
        return Array.BinarySearch(sortedNames, name, StringComparer.OrdinalIgnoreCase) >= 0;
    }

    /// <summary>
    /// Lower-cases a STYLE attribute value and strips the blocked keywords
    /// (behavior, position, top, left) from it.
    /// </summary>
    /// <param name="styleAttribute">The STYLE attribute to rewrite in place.</param>
    private static void cleanStyle(HtmlAttribute styleAttribute)
    {
        // HACK: Cheap and quick way to remove positioning and behaviors from a style.
        // This is plain substring removal, so it also alters harmless values
        // that merely contain a blocked token (e.g. "border-top").
        string val = styleAttribute.Value;
        if (val == null)
        {
            return;
        }

        // Invariant casing: a culture-sensitive ToLower() can map 'I' to a
        // dotless i, letting "POSITION"/"BEHAVIOR" slip past the filter.
        val = val.ToLowerInvariant();

        // Repeat until nothing more is removed. A single pass can be defeated
        // by nesting, e.g. "behabehaviorvior" collapses to "behavior".
        int lengthBefore;
        do
        {
            lengthBefore = val.Length;
            foreach (string token in blockedStyleTokens)
            {
                val = val.Replace(token, "");
            }
        }
        while (val.Length != lengthBefore);

        styleAttribute.Value = val;
    }
}