
Say the cleanHtml string contains HTML from a web page that was encoded in UTF-8.
It turns out the text gets incorrectly re-encoded when it is passed to this method:
TidyManaged.Document.FromString(cleanHtml);
You have to use a MemoryStream to feed the UTF-8 encoded bytes in properly:
TidyManaged.Document.FromStream(HTMLinput);
See code below to properly handle HTML documents for tidy in dotNet.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
// Purpose: run HTML Tidy over htmlText and store the repaired markup in cleanHtml.
// On any failure the enclosing method returns a short error string instead.
//
// The HTML is handed to Tidy as raw UTF-8 bytes through a stream. Passing the
// string directly via TidyManaged.Document.FromString(...) does NOT work: the
// text comes back improperly re-encoded.
byte[] encodedBytesUTF8 = Encoding.UTF8.GetBytes(htmlText);

// The stream is kept open until the document has been cleaned and saved, in case
// Tidy reads from it lazily (NOTE(review): confirm against TidyManaged), and is
// disposed on every exit path, including the early error returns below.
using (MemoryStream HTMLinput = new MemoryStream(encodedBytesUTF8))
{
    try
    {
        tidydoc = TidyManaged.Document.FromStream(HTMLinput);
    }
    catch (Exception tex)
    {
        return "tidy HTML parsing error: " + tex.Message;
    }

    using (tidydoc)
    {
        // Option reference: http://api.html-tidy.org/tidy/quickref_5.0.0.html

        // Diagnostics: stay silent and always produce output.
        tidydoc.ShowWarnings = false;
        tidydoc.Quiet = true;
        tidydoc.ForceOutput = true;

        // Document shape: plain HTML, no doctype, no XHTML/XML conversion.
        tidydoc.OutputBodyOnly = TidyManaged.AutoBool.Auto;
        tidydoc.DocType = TidyManaged.DocTypeMode.Omit;
        tidydoc.OutputXhtml = false;
        tidydoc.OutputXml = false;
        tidydoc.OutputHtml = true;
        tidydoc.UseXmlParser = false;
        tidydoc.AddTidyMetaElement = false;

        // Leave the author's markup alone: no clean-up rewrites or merging.
        tidydoc.DropFontTags = false;
        tidydoc.UseLogicalEmphasis = false;
        tidydoc.LowerCaseLiterals = false;
        tidydoc.MakeClean = false;
        tidydoc.DropEmptyParagraphs = false;
        tidydoc.CleanWord2000 = false;
        tidydoc.JoinStyles = false;
        tidydoc.JoinClasses = false;
        tidydoc.MergeDivs = TidyManaged.AutoBool.No;
        tidydoc.MergeSpans = TidyManaged.AutoBool.No;

        // Entity handling.
        tidydoc.QuoteAmpersands = false; // Whether Tidy should output unadorned & characters as &amp;.
        tidydoc.AsciiEntities = false; // Modifies the -c (--clean yes) option: when "yes", &emdash;, &rdquo; and other named character entities are downgraded to their closest ASCII equivalents.
        tidydoc.PreserveEntities = true; // Whether Tidy should preserve well-formed entities as found in the input.
        tidydoc.OutputNumericEntities = true; // Whether entities other than the built-in HTML entities (&amp;, &lt;, &gt; and &quot;) are written in numeric rather than named form.

        // Pretty-printing.
        tidydoc.Markup = true; // emit the prettified markup
        tidydoc.WrapAt = 0;
        tidydoc.IndentSpaces = 4;
        tidydoc.IndentBlockElements = TidyManaged.AutoBool.Yes; // increases output size, but is easier to read

        // Encoding: UTF-8 for input and output.
        tidydoc.InputCharacterEncoding = TidyManaged.EncodingType.Utf8;
        tidydoc.CharacterEncoding = TidyManaged.EncodingType.Utf8; // Tidy quickref: for "raw", values above 127 are output without being translated into entities.
        tidydoc.OutputCharacterEncoding = TidyManaged.EncodingType.Utf8;

        try
        {
            tidydoc.CleanAndRepair(); // required before Save()
        }
        catch (Exception car)
        {
            return "tidy HTML clean && repair error: " + car.Message;
        }

        try
        {
            cleanHtml = tidydoc.Save();
        }
        catch (Exception save)
        {
            return "tidy HTML save error: " + save.Message;
        }
    }
}
|
No comments:
Post a Comment