Monday, June 17, 2019

How to set-up Tidy to properly read HTML pages and encodings

After many hours of painstaking analysis, it turns out that the https://github.com/markbeaton/TidyManaged package cannot read an HTML string correctly.

It could be an implementation error in the TidyManaged library, or a subtle consequence of how the .NET Framework 4.0 handles strings: they are actually stored as UTF-16, and the conversion screws up HTML entities in particular. This was supposed to be fixed in this version of the framework and above, but perhaps the problem persists.

So say cleanHtml contains HTML from a webpage, which was encoded in UTF-8.

Well, it turns out that this HTML gets mangled when it is passed into this method:

TidyManaged.Document.FromString(cleanHtml);

You have to use a MemoryStream to feed the UTF-8 encoded bytes in properly.

TidyManaged.Document.FromStream(HTMLinput);

 See the code below to properly handle HTML documents for Tidy in .NET.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
            
            // Tidies the HTML held in htmlText and stores the result in cleanHtml.
            // On failure an error string is returned instead of throwing.
            //
            // The HTML is handed to Tidy as UTF-8 bytes through a MemoryStream; after many hours
            // of painstaking analysis, this is how to get HTML to be passed correctly.
            byte[] encodedBytesUTF8 = Encoding.UTF8.GetBytes(htmlText);
            MemoryStream HTMLinput = new MemoryStream(encodedBytesUTF8);

            try
            {
                // Do NOT use TidyManaged.Document.FromString(htmlText) here: it was observed
                // to re-encode the string improperly (found after many hours of painstaking analysis).
                tidydoc = TidyManaged.Document.FromStream(HTMLinput);
            }
            catch (Exception tex)
            {
                // The document was never created, so the stream must be released here.
                HTMLinput.Dispose();
                return "tidy HTML parsing error: " + tex.Message;
            }

            // NOTE(review): the stream is kept open until the document has been cleaned and saved,
            // in case TidyManaged reads it lazily -- confirm against the library.
            using (HTMLinput)
            using (tidydoc)
            {
                // Option reference: http://api.html-tidy.org/tidy/quickref_5.0.0.html

                // Diagnostics / output shape.
                tidydoc.ShowWarnings = false;
                tidydoc.Quiet = true;
                tidydoc.ForceOutput = true;
                tidydoc.OutputBodyOnly = TidyManaged.AutoBool.Auto;

                // Leave the markup as close to the input as possible: no doctype, no tag rewriting.
                tidydoc.DocType = TidyManaged.DocTypeMode.Omit;
                tidydoc.DropFontTags = false;
                tidydoc.UseLogicalEmphasis = false;
                tidydoc.LowerCaseLiterals = false;

                tidydoc.OutputXhtml = false;
                tidydoc.OutputXml = false;

                tidydoc.MakeClean = false;

                tidydoc.DropEmptyParagraphs = false;
                tidydoc.CleanWord2000 = false;

                // Entity handling.
                tidydoc.QuoteAmpersands = false; // This option specifies if Tidy should output unadorned & characters as &amp;.
                tidydoc.AsciiEntities = false;   // Can be used to modify behavior of -c (--clean yes) option. If set to "yes" when using -c, &emdash;, &rdquo;, and other named character entities are downgraded to their closest ascii equivalents.
                tidydoc.PreserveEntities = true; // This option specifies if Tidy should preserve the well-formed entities as found in the input.
                tidydoc.OutputNumericEntities = true; // This option specifies if Tidy should output entities other than the built-in HTML entities (&amp;, &lt;, &gt; and &quot;) in the numeric rather than the named entity form.

                // Do not merge attributes or elements.
                tidydoc.JoinStyles = false;
                tidydoc.JoinClasses = false;
                tidydoc.MergeDivs = TidyManaged.AutoBool.No;
                tidydoc.MergeSpans = TidyManaged.AutoBool.No;

                tidydoc.Markup = true; // prettify open

                // Pretty-printing.
                tidydoc.WrapAt = 0;
                tidydoc.IndentSpaces = 4;
                tidydoc.IndentBlockElements = TidyManaged.AutoBool.Yes; // this increases file size! (but makes it better to read)

                // Everything is UTF-8, matching the bytes fed in through the stream.
                tidydoc.InputCharacterEncoding = TidyManaged.EncodingType.Utf8;
                tidydoc.CharacterEncoding = TidyManaged.EncodingType.Utf8; // For raw, Tidy will output values above 127 without translating them into entities.
                tidydoc.OutputCharacterEncoding = TidyManaged.EncodingType.Utf8;

                tidydoc.OutputHtml = true;

                tidydoc.UseXmlParser = false;
                tidydoc.AddTidyMetaElement = false;

                try
                {
                    tidydoc.CleanAndRepair(); // required
                }
                catch (Exception car)
                {
                    return "tidy HTML clean && repair error: " + car.Message;
                }

                try
                {
                    cleanHtml = tidydoc.Save();
                }
                catch (Exception save)
                {
                    return "tidy HTML save error: " + save.Message;
                }
            }

No comments:

Post a Comment