Monday, October 19, 2020

C# .NET Get integer from hexadecimal string, many hex formats supported version 2
















The code below brackets the first likely hex number that matches, from the list of many hex formats that are specified with leading hex prefixes. Then it proceeds to remove prefixes and apply the TryParse functions. 

 
The C# TryParse functions with NumberStyles.HexNumber require hex prefixes such as "0x" to be removed first in order to work.

This will fail 
UInt32.TryParse("0x20", 
             NumberStyles.HexNumber, // AllowHexSpecifier - Strings that are parsed using this style cannot be prefixed with "0x" or "&h". 
             CultureInfo.InvariantCulture,  // I've also tried CurrentCulture
             out number));

//See https://docs.microsoft.com/en-us/dotnet/api/system.globalization.numberstyles?view=netcore-3.1#System_Globalization_NumberStyles_AllowHexSpecifier

Note: It is tempting to optimize the first alternative of the prefix regex from
0x[0-9a-f]{2,}
to
0x([0-9a-f]{2,}) and use a group capture
but because the pattern is an alternation of many prefix expressions, each alternative would need its own capture group and the overlapping groups are troublesome to handle.


This is an update to my last post about this - 

C# .NET How to get integer from hexadecimal string, many hex formats supported


This code removes the prefixes of the many hex formats listed here - https://en.wikipedia.org/wiki/Hexadecimal


Source Code

using System;
using System.Globalization;
using System.Text.RegularExpressions;

/// <summary>
/// Console demo that finds the first hexadecimal number inside a piece of text,
/// strips any well-known hex prefix from it and converts it to an unsigned integer.
/// </summary>
public static class Program
{
    // Prefixed hex notations, see https://en.wikipedia.org/wiki/Hexadecimal.
    // Longer prefixes are listed before shorter ones that share characters
    // (e.g. "&#x" before "&#", "#x" before "#") so the alternation prefers them.
    // NOTE(review): in HTML/XML a "&#...;" reference without the "x" is decimal, yet this
    // pattern (and the parsing below) treats its digits as hexadecimal - confirm that is intended.
    const string strRegHexPrefixCandidates = @"0x[0-9a-f]{2,}|%x[0-9a-f]{2,}|\\u[0-9a-f]{2,}|&#x([0-9a-f]){1,6};|&#([0-9a-f]){1,6};|\\x[0-9a-f]{2,}|\\s[0-9a-f]{2,}|U\+[0-9a-f]{2,}|X'[0-9a-f]{2,}|16#([0-9a-f]){2,}|#x([0-9a-f]){2,}|#16r([0-9a-f]){2,6}|&H([0-9a-f]){2,}|0h([0-9a-f]){2,}|#([0-9a-f]){1,6}|%[0-9a-f]{2,}";

    // Fallback for text without a recognised prefix: the first run of two or more hex digits.
    // Surrounding quotation marks need no special handling because the match starts at the
    // first digit; including the quotes in the match would only make TryParse fail.
    const string strRegGetHexNumber = @"[0-9a-f]{2,}";

    // Digits that directly follow a stripped prefix. One digit is enough here because
    // some prefixed forms ("&#x..;", "&#..;", "#..") allow a single digit.
    const string strRegLeadingHexDigits = @"^[0-9a-f]+";

    private static readonly Regex rgxHexPre = new Regex(strRegHexPrefixCandidates, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant | RegexOptions.Compiled);
    private static readonly Regex rgxGetHexAgressive = new Regex(strRegGetHexNumber, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);
    private static readonly Regex rgxLeadingHexDigits = new Regex(strRegLeadingHexDigits, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Same prefixes, in the same order, as in strRegHexPrefixCandidates.
    private static readonly string[] prefixHexs = new string[] { "0x", "%x", "\\u", "&#x", "&#", "\\x", "\\s", "U+", "X'", "16#", "#x", "#16r", "&H", "0h", "#", "%" };

    /// <summary>
    /// Runs the extraction on a sample text and writes the outcome to the console.
    /// </summary>
    public static void Main()
    {
        //string unicodeText = "UTF-16 (hex)	0x0023 (0023)";
        string unicodeText = @"In XML and XHTML, characters can be expressed as hexadecimal numeric character references using the notation &#xcode;, for instance ’ represents the character U+2019 (the right single quotation mark). If there is no x the number is decimal (thus ’ is the same character).[3]";
        //string unicodeText = "8E2";
        //string unicodeText = "this is the end";

        ulong number;
        string hex_value;

        if (TryGetHexNumber(unicodeText, out number, out hex_value))
        {
            // The N0 (thousands separator) format belongs to the number, not to the matched text.
            unicodeText = string.Format("{0:N0} integer from found {1} hex number in string: {2}", number, hex_value, unicodeText);
        }
        else
        {
            unicodeText = "Could not find a hex number in string: \"" + unicodeText + "\". Select the hex number only.";
        }

        Console.WriteLine(unicodeText);
    }

    /// <summary>
    /// Finds the first hexadecimal number in <paramref name="text"/> and converts it.
    /// A number written with one of the known prefixes wins; otherwise the first run of
    /// two or more hex digits anywhere in the text is used.
    /// </summary>
    /// <param name="text">Text to search; may be null or empty.</param>
    /// <param name="number">The parsed value, or 0 when nothing could be parsed.</param>
    /// <param name="matchedText">
    /// The text that was recognised: the prefixed candidate (e.g. "0x20") when a prefix was
    /// found, otherwise the bare digits. Empty when nothing could be parsed.
    /// </param>
    /// <returns>true when a hex number was found and fits into a <see cref="ulong"/>.</returns>
    public static bool TryGetHexNumber(string text, out ulong number, out string matchedText)
    {
        number = 0;
        matchedText = string.Empty;

        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        string digits = string.Empty;
        string found = string.Empty;

        Match candidate = rgxHexPre.Match(text);
        if (candidate.Success)
        {
            found = candidate.Value;
            foreach (string pre in prefixHexs)
            {
                // The regex is case-insensitive, so the prefix comparison must be too;
                // StartsWith avoids accidentally matching a prefix later in the candidate.
                if (found.StartsWith(pre, StringComparison.OrdinalIgnoreCase))
                {
                    Match lead = rgxLeadingHexDigits.Match(found.Substring(pre.Length));
                    if (lead.Success)
                    {
                        digits = lead.Value;
                    }
                    break;
                }
            }
        }

        if (digits.Length == 0)
        {
            // No usable prefixed number: fall back to the first bare run of hex digits.
            Match bare = rgxGetHexAgressive.Match(text);
            if (!bare.Success)
            {
                return false;
            }
            digits = bare.Value;
            found = bare.Value;
        }

        // TryParse reports failure (e.g. more than 16 digits) through its return value,
        // so no exception handling is needed. HexNumber does not accept prefixes such as "0x".
        if (!ulong.TryParse(digits, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out number))
        {
            number = 0;
            return false;
        }

        matchedText = found;
        return true;
    }
}

No comments:

Post a Comment