Normally in C#, the way to solve this would be to load the document using the HTML Agility Pack library or the more modern AngleSharp library, but both are pretty heavy downloads and require a detailed security review. Who has time for that?
So, here's the regex solution to counting HTML columns (live code).
So here's the regex solution to counting HTML columns.
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text.RegularExpressions;

/// <summary>
/// Regex-based estimate of the number of columns in an HTML table.
/// </summary>
/// <remarks>
/// This is a lightweight heuristic, not an HTML parser. Known limitations:
/// nested tables, cells or rows whose optional end tags are omitted,
/// attribute values containing '&gt;', and <c>rowspan</c> are not handled.
/// </remarks>
public class Program
{
    /// <summary>Upper bound applied to a single cell's colspan (the HTML Standard caps it at 1000).</summary>
    private const int MaxColumnSpan = 1000;

    // Group 1 = everything between <tr ...> and </tr>.
    // \b keeps "<tr" from matching longer tag names such as <track>; [^>]* keeps the
    // opening tag from running past its own '>'; Singleline lets a row span several lines.
    private static readonly Regex s_rowRegex = new Regex(
        @"<tr\b[^>]*>(.*?)</tr\s*>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Group 1 = the attribute text of a <td>/<th> opening tag.
    // [^>]* (instead of a greedy .*) stops at the end of the opening tag, so several
    // cells written on one line are matched individually rather than as one giant cell.
    private static readonly Regex s_cellRegex = new Regex(
        @"<t[hd]\b([^>]*)>.*?</t[hd]\s*>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Group 1 = the digits of a colspan attribute (quoted or unquoted).
    // The look-behind rejects attributes that merely end in "colspan" (e.g. data-colspan).
    private static readonly Regex s_colspanRegex = new Regex(
        @"(?<![\w-])colspan\s*=\s*[""']?\s*([0-9]+)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Counts the columns of the embedded sample table and prints the per-row matches,
    /// the maximum column count, the trimmed average, and the final column count.
    /// </summary>
    public static void Main()
    {
        // Same sample markup as before, split across literals only for readability.
        string html =
            " <table>"
            + " <caption>Employee Information for https://metadataconsulting.blogspot.com/</caption>"
            + " <thead>"
            + " <tr> <th>Name</th> <th>Position</th> <th>Office</th> <th>Age</th> <th>Start Date</th> <th>Salary</th> </tr>"
            + " </thead>"
            + " <tbody>"
            + " <tr class='buttler'> <td>John Doe</td> <td>Software Engineer</td> <td>New York</td> <td>30</td> <td>2015-01-15</td> <td>$120,000</td> </tr>"
            + " <tr> <td>Jane Smith</td> <td>Project Manager</td> <td>London</td> <td>40</td> <td>2010-03-25</td> <td>$150,000</td> </tr>"
            + " <tr> <td>Emily Johnson</td> <td>Designer</td> <td>San Francisco</td> <td>28</td> <td>2018-07-12</td> <td>$100,000</td> </tr>"
            + " </tbody>"
            + " <tfoot>"
            + " <tr> <td colspan=6>End of Employee Information</td> </tr>"
            + " </tfoot>"
            + " </table>";

        List<int> columnsPerRow = new List<int>();
        int maxColumns = 0;
        int rowNumber = 0;

        foreach (Match rowMatch in s_rowRegex.Matches(html))
        {
            rowNumber++;
            string rowContent = rowMatch.Groups[1].Value;
            Console.WriteLine("Match #{0}, Match Row ='{1}'", rowNumber, rowContent);

            int rowColumns = 0;
            int cellNumber = 0;
            foreach (Match cellMatch in s_cellRegex.Matches(rowContent))
            {
                cellNumber++;
                Console.WriteLine("\r\n\t\t\tMatch #{0}, Match Value ='{1}' ", cellNumber, cellMatch.Value);

                // A cell contributes as many columns as its colspan says (1 when absent).
                rowColumns += GetColumnSpan(cellMatch.Groups[1].Value);
            }

            // Rows without any cells are still recorded (as 0) so the row count stays accurate.
            columnsPerRow.Add(rowColumns);
            maxColumns = Math.Max(maxColumns, rowColumns);
            Console.WriteLine();
        }

        Console.WriteLine("Max number of columns {0}", maxColumns);
        Console.WriteLine("Number of rows {0}", columnsPerRow.Count);
        Console.WriteLine("List: " + string.Join(", ", columnsPerRow));

        int finalColumns = maxColumns;

        if (columnsPerRow.Count > 2)
        {
            // Work on a sorted copy so the per-row list keeps its document order,
            // then drop the smallest and the largest value before averaging.
            List<int> trimmed = new List<int>(columnsPerRow);
            trimmed.Sort();
            trimmed.RemoveAt(0);
            trimmed.RemoveAt(trimmed.Count - 1);

            Console.WriteLine("Number of avgListReduced {0}", trimmed.Count);
            Console.WriteLine("Updated list: " + string.Join(", ", trimmed));

            // trimmed holds at least one element here, so Average() cannot throw.
            int roundedAverage = (int)Math.Round(trimmed.Average(), MidpointRounding.AwayFromZero);
            Console.WriteLine("Average number of columns " + roundedAverage);

            // The trimmed average only overrides the maximum when at least five
            // rows remain after trimming and it disagrees with the maximum.
            if (trimmed.Count >= 5 && roundedAverage != maxColumns)
            {
                finalColumns = roundedAverage;
            }
        }

        Console.WriteLine("Final number of columns {0}", finalColumns);
    }

    /// <summary>
    /// Returns the number of columns a cell occupies, based on its colspan attribute.
    /// </summary>
    /// <param name="cellAttributes">Attribute text of the cell's opening tag (may be empty).</param>
    /// <returns>
    /// The colspan value clamped to <see cref="MaxColumnSpan"/>, or 1 when the attribute
    /// is missing, zero, or not a parsable integer.
    /// </returns>
    private static int GetColumnSpan(string cellAttributes)
    {
        Match spanMatch = s_colspanRegex.Match(cellAttributes);
        if (!spanMatch.Success)
        {
            return 1;
        }

        int span;
        bool parsed = int.TryParse(
            spanMatch.Groups[1].Value,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out span);

        // TryParse fails on overflow; treat that and a zero span as a plain single cell.
        if (!parsed || span < 1)
        {
            return 1;
        }

        return Math.Min(span, MaxColumnSpan);
    }
}
No comments:
Post a Comment