Friday, January 8, 2021

C# .NET: How to get all emails from an HTML page using the inner text of mailto links


The ExtractEmailswithInnerText method below will extract emails from an HTML page in the format "name <email@address.com>" using HTMLAgilityPack.

Here's a typical example of an email address we find on a webpage:


which is marked up with the following HTML:

<a href="mailto:marketcall@bnnbloomberg.ca">marketballs@bnnbloomberg.ca</a>

where the inner text is "marketballs@bnnbloomberg.ca".

Therefore, the expected result is this aliased email address:

marketballs@bnnbloomberg.ca <marketcall@bnnbloomberg.ca>




Source Code

using System;using System.Text; using System.Linq;using HtmlAgilityPack; 
					
public class Program
{

	/// <summary>
	/// Extracts every <c>mailto:</c> link from an HTML fragment together with its inner text.
	/// </summary>
	/// <param name="s">HTML markup to scan. A null or empty value yields an empty result.</param>
	/// <returns>
	/// One line per mailto link in the form <c>inner text &lt;address&gt;</c>, each terminated by
	/// a newline. An empty string is returned when the markup contains no mailto links or when
	/// parsing fails.
	/// </returns>
	public static string ExtractEmailswithInnerText(string s)
	{
		if (string.IsNullOrEmpty(s))
		{
			return string.Empty;
		}

		const string mailto = "mailto:";
		string h = NormalizeControlCharacters(s);
		StringBuilder sb = new StringBuilder();

		try
		{
			HtmlAgilityPack.HtmlDocument htmldoc = new HtmlAgilityPack.HtmlDocument();
			htmldoc.LoadHtml(h);

			// HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing
			// matches, so the result must be null-checked before it is enumerated.
			HtmlAgilityPack.HtmlNodeCollection anchors = htmldoc.DocumentNode.SelectNodes("//a[@href]");
			if (anchors == null)
			{
				return string.Empty;
			}

			foreach (HtmlAgilityPack.HtmlNode node in anchors)
			{
				string href = (node.GetAttributeValue("href", string.Empty) ?? string.Empty).Trim();

				// URI schemes are case-insensitive; compare ordinally so the current culture
				// cannot influence the match.
				if (!href.StartsWith(mailto, StringComparison.OrdinalIgnoreCase))
				{
					continue;
				}

				// Strip only the leading scheme; Replace() would also remove any later
				// occurrence of "mailto:" inside the value.
				string address = href.Substring(mailto.Length);

				// Drop the optional "?subject=...&body=..." header part of the mailto URI.
				int queryStart = address.IndexOf('?');
				if (queryStart >= 0)
				{
					address = address.Substring(0, queryStart);
				}

				address = address.Trim();
				if (address.Length == 0)
				{
					// A bare "mailto:" carries no address, so there is nothing to report.
					continue;
				}

				sb.AppendLine(string.Concat(node.InnerText.Trim(), " <", address, ">"));
			}

			return sb.ToString();
		}
		catch (Exception)
		{
			// Extraction is deliberately best-effort: markup that cannot be processed
			// yields no addresses instead of an exception.
			return string.Empty;
		}
	}

	/// <summary>
	/// Removes ASCII and Unicode control characters from <paramref name="text"/>, turning the
	/// whitespace ones (tab, CR, LF, ...) into a single space so that adjacent tokens such as
	/// <c>&lt;a</c> and <c>href</c> are not glued together.
	/// </summary>
	private static string NormalizeControlCharacters(string text)
	{
		StringBuilder cleaned = new StringBuilder(text.Length);

		foreach (char c in text)
		{
			if (!char.IsControl(c))
			{
				cleaned.Append(c);
			}
			else if (char.IsWhiteSpace(c))
			{
				cleaned.Append(' ');
			}
		}

		return cleaned.ToString();
	}

	/// <summary>
	/// Demonstrates the extractor on a page containing a mailto link and on a page that only
	/// mentions an address in plain text (which produces no output).
	/// </summary>
	public static void Main()
	{
		string html = "<a href=\"https://trojanhorsethebook.com/wordpress/\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021      Mark Russinovich.  All Rights Reserved.</p>    <p>Email: <a href=\"mailto:mark@russinovich.com\">mark@russinovich.com</a></p></div>";
		// A mailto link with empty inner text is uncommon and is not special-cased: it is reported as " <address>".
		//string html2 = "<a href=\"https://trojanhorsethebook.com/wordpress/\">Home</a><div class=\"footerLeft\"><p>©<script type=\"text/javascript\">copyright=new Date();update=copyright.getFullYear();document.write(update);</script>2021      Mark Russinovich.  All Rights Reserved.</p>    <p>Email: <a href=\"mailto:mark@russinovich.com\"></a></p></div>";
		// No anchor elements at all: the address appears only as plain text, so nothing is extracted.
		string html3 = "<div id=\"post\"><article class=\"article single\"><header class=\"article-header\"><h1>Contact</h1></header><div class=\"article-content\"><p>If you have any questions, comments or concerns, please email help@thurrott.com</p></div></article> </div>"; 
		Console.WriteLine(ExtractEmailswithInnerText(html));
		Console.WriteLine(ExtractEmailswithInnerText(html3));
	}
}

No comments:

Post a Comment