I'll focus on getting the number of rows and columns for an HTML Table.
I wanted to do this to avoid using bulky and unreliable third-party libraries. So, although it is generally not advised, I used regular expressions to parse the HTML. Getting the number of rows in an HTML table is simple enough, but counting the columns can be tricky because of jagged tables (tables that use colspan=, so their rows do not all contain the same number of cells). My other motivation is that this topic has been sparsely covered.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

/// <summary>
/// Demo program that estimates the number of rows and columns of an HTML table
/// using regular expressions only (no HTML parser).
/// </summary>
/// <remarks>
/// Limitations: nested tables are not supported (a row match ends at the first
/// closing tr tag), and colspan values are not summed; a cell with colspan=6
/// still counts as one cell. Jagged rows are instead smoothed out by the
/// trimmed-average heuristic in <see cref="EstimateColumnCount"/>.
/// </remarks>
public class Program
{
    /// <summary>
    /// Minimum number of rows that must remain after trimming before the rounded
    /// average is allowed to override the maximum cell count.
    /// </summary>
    private const int MinTrimmedRowsForAverage = 5;

    /// <summary>
    /// Matches one table row; group 1 is everything between the opening and closing tr tags.
    /// </summary>
    /// <remarks>
    /// "\b" keeps tags that merely start with "tr" (e.g. track) from matching,
    /// "[^>]*" consumes attributes without running past the end of the opening tag,
    /// and Singleline lets the row content span several lines.
    /// </remarks>
    private static readonly Regex RowRegex = new Regex(
        @"<tr\b[^>]*>(.*?)</tr>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches one header or data cell (th or td) inside a row.
    /// </summary>
    /// <remarks>
    /// "[^>]*" (instead of a greedy ".*") stops at the end of the opening tag, so
    /// several cells written on the same physical line are matched one by one.
    /// </remarks>
    private static readonly Regex CellRegex = new Regex(
        @"<t[hd]\b[^>]*>(.*?)</t[hd]>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Runs the demo on a hard-coded sample table and prints the per-row matches,
    /// the maximum cell count, the row count and the final column estimate.
    /// </summary>
    public static void Main()
    {
        // Same sample markup as before, split into adjacent literals for readability;
        // the concatenated value is unchanged.
        const string html =
            " <table>"
            + " <caption>Employee Information for https://metadataconsulting.blogspot.com/</caption>"
            + " <thead>"
            + " <tr> <th>Name</th> <th>Position</th> <th>Office</th> <th>Age</th> <th>Start Date</th> <th>Salary</th> </tr>"
            + " </thead>"
            + " <tbody>"
            + " <tr class='buttler'> <td>John Doe</td> <td>Software Engineer</td> <td>New York</td> <td>30</td> <td>2015-01-15</td> <td>$120,000</td> </tr>"
            + " <tr> <td>Jane Smith</td> <td>Project Manager</td> <td>London</td> <td>40</td> <td>2010-03-25</td> <td>$150,000</td> </tr>"
            + " <tr> <td>Emily Johnson</td> <td>Designer</td> <td>San Francisco</td> <td>28</td> <td>2018-07-12</td> <td>$100,000</td> </tr>"
            + " </tbody>"
            + " <tfoot>"
            + " <tr> <td colspan=6>End of Employee Information</td> </tr>"
            + " </tfoot>"
            + " </table>";

        List<int> cellsPerRow = CountCellsPerRow(html);
        int maxCells = cellsPerRow.Count > 0 ? cellsPerRow.Max() : 0;

        Console.WriteLine("Max number of columns {0}", maxCells);
        Console.WriteLine("Number of rows {0}", cellsPerRow.Count);
        Console.WriteLine("List: " + string.Join(", ", cellsPerRow));

        int finalColumnCount = EstimateColumnCount(cellsPerRow, maxCells);
        Console.WriteLine("Final number of columns {0}", finalColumnCount);
    }

    /// <summary>
    /// Finds every table row in <paramref name="html"/> and counts its th/td cells,
    /// echoing each row and cell match to the console.
    /// </summary>
    /// <param name="html">Markup to scan; null or empty yields an empty list.</param>
    /// <returns>One entry per matched row, in document order, holding that row's cell count.</returns>
    private static List<int> CountCellsPerRow(string html)
    {
        List<int> cellsPerRow = new List<int>();
        if (string.IsNullOrEmpty(html))
        {
            return cellsPerRow;
        }

        int rowNumber = 0;
        foreach (Match row in RowRegex.Matches(html))
        {
            rowNumber++;
            string rowContent = row.Groups[1].Value;
            Console.WriteLine("Match #{0}, Match Row ='{1}'", rowNumber, rowContent);

            int cellNumber = 0;
            foreach (Match cell in CellRegex.Matches(rowContent))
            {
                cellNumber++;
                Console.WriteLine("\r\n\t\t\tMatch #{0}, Match Value ='{1}' ", cellNumber, cell.Value);
            }

            // Rows without any cell are still recorded (as 0) so the row count stays accurate.
            cellsPerRow.Add(cellNumber);
            Console.WriteLine();
        }

        return cellsPerRow;
    }

    /// <summary>
    /// Chooses the final column count from the per-row cell counts.
    /// </summary>
    /// <remarks>
    /// With more than two rows, the smallest and largest counts are dropped and the
    /// remaining counts are averaged (rounded half away from zero). The average wins
    /// only when at least <see cref="MinTrimmedRowsForAverage"/> rows remain after
    /// trimming and it differs from <paramref name="maxCells"/>; otherwise
    /// <paramref name="maxCells"/> is returned.
    /// </remarks>
    /// <param name="cellsPerRow">Cell count of every row; not modified.</param>
    /// <param name="maxCells">Largest value in <paramref name="cellsPerRow"/> (0 when empty).</param>
    /// <returns>The estimated number of columns.</returns>
    private static int EstimateColumnCount(List<int> cellsPerRow, int maxCells)
    {
        if (cellsPerRow.Count <= 2)
        {
            return maxCells;
        }

        // Work on a sorted copy so the caller's list keeps its original order and content.
        List<int> trimmed = cellsPerRow.OrderBy(count => count).ToList();
        trimmed.RemoveAt(0);                 // drop the smallest count
        trimmed.RemoveAt(trimmed.Count - 1); // drop the largest count

        Console.WriteLine("Number of avgListReduced {0}", trimmed.Count);
        Console.WriteLine("Updated list: " + string.Join(", ", trimmed));

        // trimmed holds at least one element here, so Average() cannot throw.
        int roundedAverage = (int)Math.Round(trimmed.Average(), MidpointRounding.AwayFromZero);
        Console.WriteLine("Average number of columns " + roundedAverage);

        if (trimmed.Count >= MinTrimmedRowsForAverage && roundedAverage != maxCells)
        {
            return roundedAverage;
        }

        return maxCells;
    }
}