The ExtractEmailswithInnerText method below uses HtmlAgilityPack to extract the mailto links from an HTML page and return each one in the format "name <email@address.com>".
Here's a typical example of an email address we find on a webpage
Email: marketballs@bnnbloomberg.ca
which is marked up with the following HTML:
<a href="mailto:marketcall@bnnbloomberg.ca">marketballs@bnnbloomberg.ca</a>
where the inner text is "marketballs@bnnbloomberg.ca".
Therefore, the expected result is this aliased email address:
marketballs@bnnbloomberg.ca <marketcall@bnnbloomberg.ca>
Source Code
using System;
using System.Linq;
using System.Text;

/// <summary>
/// Demo program that pulls <c>mailto:</c> links out of HTML with HtmlAgilityPack
/// and prints them as aliased email addresses.
/// </summary>
public class Program
{
    /// <summary>URI scheme prefix that marks an anchor as an email link.</summary>
    private const string MailtoScheme = "mailto:";

    /// <summary>
    /// Extracts every anchor whose <c>href</c> is a <c>mailto:</c> link and renders it
    /// as <c>inner text &lt;address&gt;</c>, one entry per line.
    /// </summary>
    /// <param name="s">HTML markup to scan. Null or empty input yields an empty result.</param>
    /// <returns>
    /// One line per mailto link, each terminated by <see cref="Environment.NewLine"/>;
    /// an empty string when no mailto link is found or the markup cannot be processed.
    /// When a link has no inner text only <c>&lt;address&gt;</c> is emitted.
    /// </returns>
    public static string ExtractEmailswithInnerText(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return string.Empty;
        }

        // Remove ASCII and Unicode control characters before parsing.
        string cleaned = new string(s.Where(c => !char.IsControl(c)).ToArray());

        try
        {
            HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();
            htmldoc.LoadHtml(cleaned);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so test for null instead of relying on an exception being swallowed.
            HtmlAgilityPack.HtmlNodeCollection anchors = htmldoc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            foreach (HtmlAgilityPack.HtmlNode node in anchors)
            {
                // Attribute values are stored as written in the markup; decode entities
                // so entity-obfuscated addresses are recognised too.
                string href = HtmlAgilityPack.HtmlEntity.DeEntitize(node.GetAttributeValue("href", string.Empty)).Trim();

                // URI schemes are case-insensitive, so "MAILTO:" is as valid as "mailto:".
                if (!href.StartsWith(MailtoScheme, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Strip only the leading scheme; a global Replace would also eat any
                // later occurrence of the text "mailto:".
                string address = href.Substring(MailtoScheme.Length);

                // Drop header fields such as "?subject=Hello"; they are not part of the address.
                int queryStart = address.IndexOf('?');
                if (queryStart >= 0)
                {
                    address = address.Substring(0, queryStart);
                }

                address = address.Trim();
                if (address.Length == 0)
                {
                    // A bare "mailto:" carries no address worth reporting.
                    continue;
                }

                string label = HtmlAgilityPack.HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (label.Length > 0)
                {
                    sb.Append(label).Append(' ');
                }

                sb.Append('<').Append(address).AppendLine(">");
            }

            return sb.ToString();
        }
        catch (Exception)
        {
            // Deliberate best effort: markup the parser cannot handle yields no addresses
            // rather than failing the caller.
            return string.Empty;
        }
    }

    /// <summary>
    /// Runs the extractor against two sample pages: one containing a mailto link and
    /// one containing only a plain-text address (which produces no output line).
    /// </summary>
    public static void Main()
    {
        string html = "<a href=\"https://trojanhorsethebook.com/wordpress/\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021 Mark Russinovich. All Rights Reserved.</p> <p>Email: <a href=\"mailto:mark@russinovich.com\">mark@russinovich.com</a></p></div>";

        // Sample with an empty link text (uncommon in practice); left disabled so the demo output is unchanged.
        //string html2 = "<a href=\"https://trojanhorsethebook.com/wordpress/\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021 Mark Russinovich. All Rights Reserved.</p> <p>Email: <a href=\"mailto:mark@russinovich.com\"></a></p></div>";

        string html3 = "<div id=\"post\"><article class=\"article single\"><header class=\"article-header\"><h1>Contact</h1></header><div class=\"article-content\"><p>If you have any questions, comments or concerns, please email help@thurrott.com</p></div></article> </div>";

        Console.WriteLine(ExtractEmailswithInnerText(html));
        Console.WriteLine(ExtractEmailswithInnerText(html3));
    }
}
No comments:
Post a Comment