I'll focus on getting the number of rows and columns for an HTML Table.
You can use this algorithm in any language: use the regular expressions below to enumerate the rows, then count the columns (with another regular expression) within each row.
I wanted to do this to avoid using bulky and unreliable third-party libraries. So, although it is not advised, I used regular expressions to do this. Getting the number of rows in an HTML table is simple enough, but getting the number of columns can be tricky because of jagged tables (tables using colspan=). The other motivation is that this topic is sparsely covered elsewhere.
Normally in C# the way to solve this would be to load the document using the HTML Agility Pack library or the more modern AngleSharp library, but both are pretty heavy downloads and require a detailed security review. Who has time for that?
So, here's the C# regex solution for getting the size of an HTML table, as live code.
So, here's the C# regex solution for getting the size of an HTML table.
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text.RegularExpressions;

/// <summary>
/// Demo program that measures an HTML table with regular expressions only:
/// it enumerates the <c>&lt;tr&gt;</c> rows, counts the <c>&lt;th&gt;</c>/<c>&lt;td&gt;</c>
/// cells inside each row, and prints the row count plus a column-count estimate.
/// </summary>
/// <remarks>
/// This is a regex heuristic, not an HTML parser: nested tables, a literal
/// <c>&gt;</c> inside an attribute value, and <c>colspan</c>/<c>rowspan</c> widths
/// are not interpreted. A cell with <c>colspan=6</c> still counts as one cell.
/// </remarks>
public class Program
{
    // One <tr ...>...</tr> element; group 1 is everything between the tags.
    // "\b" keeps tags such as <track> from matching, "[^>]*" stops at the end
    // of the opening tag, and Singleline lets a row span several lines.
    private static readonly Regex RowRegex = new Regex(
        @"<tr\b[^>]*>(.*?)</tr\s*>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // One <th>/<td> cell. The opening tag is bounded by "[^>]*" rather than a
    // greedy ".*", so several cells written on the same line are matched one
    // by one instead of being swallowed into a single match.
    private static readonly Regex CellRegex = new Regex(
        @"<t[hd]\b[^>]*>(.*?)</t[hd]\s*>",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Sample table: one header row, three body rows and a footer row whose
    // single cell uses colspan=6.
    private const string SampleHtml =
        " <table>" +
        " <caption>Employee Information for https://metadataconsulting.blogspot.com/</caption>" +
        " <thead>" +
        " <tr>" +
        " <th>Name</th> <th>Position</th> <th>Office</th> <th>Age</th> <th>Start Date</th> <th>Salary</th>" +
        " </tr>" +
        " </thead>" +
        " <tbody>" +
        " <tr class='buttler'>" +
        " <td>John Doe</td> <td>Software Engineer</td> <td>New York</td> <td>30</td> <td>2015-01-15</td> <td>$120,000</td>" +
        " </tr>" +
        " <tr>" +
        " <td>Jane Smith</td> <td>Project Manager</td> <td>London</td> <td>40</td> <td>2010-03-25</td> <td>$150,000</td>" +
        " </tr>" +
        " <tr>" +
        " <td>Emily Johnson</td> <td>Designer</td> <td>San Francisco</td> <td>28</td> <td>2018-07-12</td> <td>$100,000</td>" +
        " </tr>" +
        " </tbody>" +
        " <tfoot>" +
        " <tr>" +
        " <td colspan=6>End of Employee Information</td>" +
        " </tr>" +
        " </tfoot>" +
        " </table>";

    /// <summary>
    /// Scans the sample table, prints every matched row and cell, then prints
    /// the maximum cells per row, the number of rows, the per-row cell counts
    /// and the final column-count estimate.
    /// </summary>
    public static void Main()
    {
        List<int> cellsPerRow = new List<int>();
        int maxColumns = 0;
        int rowNumber = 0;

        foreach (Match row in RowRegex.Matches(SampleHtml))
        {
            rowNumber++;
            string rowHtml = row.Groups[1].Value;
            Console.WriteLine("Match #{0}, Match Row ='{1}'", rowNumber, rowHtml);

            int cellCount = 0;
            foreach (Match cell in CellRegex.Matches(rowHtml))
            {
                cellCount++;
                Console.WriteLine("\r\n\t\t\tMatch #{0}, Match Value ='{1}' ", cellCount, cell.Value);
            }

            // Rows without any cell are still recorded (as 0) so that the
            // list length equals the number of rows found.
            cellsPerRow.Add(cellCount);
            maxColumns = Math.Max(maxColumns, cellCount);
            Console.WriteLine();
        }

        Console.WriteLine("Max number of columns {0}", maxColumns);
        Console.WriteLine("Number of rows {0}", cellsPerRow.Count);
        Console.WriteLine("List: " + string.Join(", ", cellsPerRow));

        int finalColumns = EstimateColumnCount(cellsPerRow, maxColumns);
        Console.WriteLine("Final number of columns {0}", finalColumns);
    }

    /// <summary>
    /// Picks the column count to report for a possibly jagged table.
    /// </summary>
    /// <param name="cellsPerRow">Cell count of each row, in document order. Not modified.</param>
    /// <param name="maxColumns">Largest value in <paramref name="cellsPerRow"/>.</param>
    /// <returns>
    /// <paramref name="maxColumns"/> when there are at most two rows, or when fewer
    /// than five counts remain after trimming; otherwise the rounded average of
    /// the trimmed counts.
    /// </returns>
    private static int EstimateColumnCount(List<int> cellsPerRow, int maxColumns)
    {
        if (cellsPerRow.Count <= 2)
        {
            return maxColumns;
        }

        // Work on a copy: sorting and trimming must not disturb the caller's
        // per-row list.
        List<int> trimmed = new List<int>(cellsPerRow);
        trimmed.Sort();
        trimmed.RemoveAt(0);                  // drop the smallest count
        trimmed.RemoveAt(trimmed.Count - 1);  // drop the largest count

        Console.WriteLine("Number of avgListReduced {0}", trimmed.Count);
        Console.WriteLine("Updated list: " + string.Join(", ", trimmed));

        // At least one element remains here, so Average() cannot throw.
        int roundedAverage = (int)Math.Round(trimmed.Average(), MidpointRounding.AwayFromZero);
        Console.WriteLine("Average number of columns " + roundedAverage);

        // The average is only trusted once enough rows remain after trimming;
        // otherwise fall back to the widest row.
        if (trimmed.Count >= 5)
        {
            return roundedAverage;
        }

        return maxColumns;
    }
}
No comments:
Post a Comment