URL parsing and manipulation in .NET

July 7th 2006 • .NET (, , ) • 7,696 views • 16 responses

While .NET has a URI class it’s not great. It doesn’t expose the various parts as properties and you can’t manipulate them either. Many projects I’ve seen (Subtext included) just try to manipulate them via strings with varying degrees of success.

Here’s a C# .NET URL decoder that uses a regular expression I developed for performance based on a VBScript class I developed a while back. Comments and white space have been removed to keep it short.

using System;
using System.Collections.Specialized;
using System.Text.RegularExpressions;

/// <summary>
/// Mutable URL whose scheme, credentials, host, port, path, query and
/// fragment are exposed as individual read/write properties. A URL string
/// is parsed by assigning <see cref="FullURL"/> and rebuilt by reading it.
/// Values are stored and emitted as given; no percent-encoding or decoding
/// is performed.
/// </summary>
public class URL : ICloneable, IComparable
{
    private const string schemeDecodeRegex = @"([^:]+):";

    // Groups: 1 = scheme (without the colon), 3 = user, 4 = host.
    private const string mailtoDecodeRegex = @"(mailto):(([^@]+)@(.+))";

    // Groups: 1 = scheme, 3 = user, 5 = password, 6 = host, 8 = port,
    // 9 = path, 11 = query, 13 = fragment.
    // The user/password classes exclude '/', '?' and '#' so that an '@' found
    // later in the path or query is not mistaken for the userinfo separator.
    private const string urlDecodeRegex = @"([^:]+)://(([^:@/?#]+)(:([^@/?#]+))?@)?([^:/?#]+)(:(\d+))?([^?#]+)?(\?([^#]+))?(#(.*))?";

    // Built once and shared; constructing a Regex for every parse is wasteful.
    private static readonly Regex schemeDecoder = new Regex(schemeDecodeRegex, RegexOptions.Compiled);
    private static readonly Regex mailtoDecoder = new Regex(mailtoDecodeRegex, RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex urlDecoder = new Regex(urlDecodeRegex, RegexOptions.Compiled);

    private URL baseUrl;
    private string scheme;
    private long port;
    private bool useDefaultPort;
    private string hostName;
    private string user;
    private string password;
    private string path;
    private NameValueCollection query;
    private string fragment;
    private bool relative;

    /// <summary>Creates an empty URL with every part reset.</summary>
    public URL() {
        Reset();
    }

    /// <summary>Creates a URL by parsing <paramref name="url"/>.</summary>
    /// <param name="url">The URL text to parse.</param>
    public URL(string url) {
        Reset();
        FullURL = url;
    }

    /// <summary>Creates a URL holding a copy of every part of <paramref name="copyUrl"/>.</summary>
    /// <param name="copyUrl">The URL to copy from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="copyUrl"/> is null.</exception>
    public URL(URL copyUrl) {
        Reset();
        CopyFrom(copyUrl);
    }

    /// <summary>Scheme such as "http"; stored trimmed, and null is stored as an empty string.</summary>
    public string Scheme {
        get { return scheme; }
        set { scheme = (value == null) ? string.Empty : value.Trim(); }
    }

    /// <summary>Port number. Assigning it also clears <see cref="UseDefaultPort"/>.</summary>
    public long Port {
        get { return port; }
        set {
            port = value;
            useDefaultPort = false;
        }
    }

    /// <summary>When true, <see cref="FullURL"/> omits the port.</summary>
    public bool UseDefaultPort {
        get { return useDefaultPort; }
        set { useDefaultPort = value; }
    }

    /// <summary>User name from the userinfo part, or the mailbox name of a mailto address.</summary>
    public string User {
        get { return user; }
        set { user = value; }
    }

    /// <summary>Password from the userinfo part.</summary>
    public string Password {
        get { return password; }
        set { password = value; }
    }

    /// <summary>Host name, or the domain of a mailto address.</summary>
    public string HostName {
        get { return hostName; }
        set { hostName = value; }
    }

    /// <summary>Path part, including its leading slash when parsed from a URL.</summary>
    public string Path {
        get { return path; }
        set { path = value; }
    }

    /// <summary>Query parameters as name/value pairs.</summary>
    public NameValueCollection Query {
        get { return query; }
        set { query = value; }
    }

    /// <summary>Fragment, without the leading '#'.</summary>
    public string Fragment {
        get { return fragment; }
        set { fragment = value; }
    }

    /// <summary>
    /// The complete URL text. Reading rebuilds it from the individual parts;
    /// assigning resets every part and then parses the new value. Text that
    /// contains no scheme, or that the patterns do not match, leaves the
    /// parts reset.
    /// </summary>
    public string FullURL {
        get {
            if (string.Equals(Scheme, "mailto", StringComparison.OrdinalIgnoreCase)) {
                return string.Format("{0}:{1}@{2}", Scheme, User, HostName);
            }

            System.Text.StringBuilder newURL = new System.Text.StringBuilder();
            if (!Relative) {
                newURL.Append(Scheme).Append("://");
                if (!string.IsNullOrEmpty(User)) {
                    newURL.Append(User);
                    if (!string.IsNullOrEmpty(Password)) {
                        newURL.Append(':').Append(Password);
                    }
                    newURL.Append('@');
                }
                newURL.Append(HostName);
                if (!UseDefaultPort) {
                    newURL.Append(':').Append(Port);
                }
            }
            newURL.Append(Path);

            string queryString = QueryString;
            if (queryString.Length > 0) {
                newURL.Append('?').Append(queryString);
            }
            if (!string.IsNullOrEmpty(Fragment)) {
                newURL.Append('#').Append(Fragment);
            }
            return newURL.ToString();
        }
        set {
            Reset();
            if (string.IsNullOrEmpty(value)) {
                return;
            }

            Match m = schemeDecoder.Match(value);
            if (!m.Success) {
                return;
            }

            // Ordinal comparison: ToLower() is culture-sensitive and can
            // mangle the scheme under some cultures.
            if (string.Equals(m.Groups[1].Value, "mailto", StringComparison.OrdinalIgnoreCase)) {
                DecodeMailTo(value);
            } else {
                DecodeURL(value);
            }
        }
    }

    /// <summary>When true, <see cref="FullURL"/> omits the scheme, credentials, host and port.</summary>
    public bool Relative {
        get { return relative; }
        set { relative = value; }
    }

    /// <summary>
    /// The query as text without the leading '?'. Pairs are joined with an
    /// ampersand, and a pair whose value is null or empty is written as the
    /// key alone. Assigning replaces the current contents of
    /// <see cref="Query"/>.
    /// </summary>
    public string QueryString {
        get {
            if (Query == null) {
                return string.Empty;
            }

            System.Text.StringBuilder newQueryString = new System.Text.StringBuilder();
            for (int queryIdx = 0; queryIdx < Query.Count; queryIdx++) {
                if (queryIdx > 0) {
                    newQueryString.Append('&');
                }
                newQueryString.Append(Query.Keys[queryIdx]);

                // The collection can hold null values; treat them as empty.
                string value = Query[queryIdx];
                if (!string.IsNullOrEmpty(value)) {
                    newQueryString.Append('=').Append(value);
                }
            }
            return newQueryString.ToString();
        }
        set {
            if (query == null) {
                query = new NameValueCollection();
            } else {
                query.Clear();
            }
            AppendQueryString(value);
        }
    }

    /// <summary>Optional base URL associated with this instance.</summary>
    public URL BaseUrl {
        get { return baseUrl; }
        set { baseUrl = value; }
    }

    /// <summary>
    /// Parses ampersand-separated "key=value" pairs and stores them in
    /// <see cref="Query"/>, overwriting any existing value for the same key.
    /// A pair without '=' is stored with an empty value.
    /// </summary>
    /// <param name="newQueryString">Query text without the leading '?'; null or empty adds nothing.</param>
    public void AppendQueryString(string newQueryString) {
        if (string.IsNullOrEmpty(newQueryString)) {
            return;
        }
        if (query == null) {
            query = new NameValueCollection();
        }

        foreach (string pair in newQueryString.Split('&')) {
            // Skip the empty segments produced by a doubled or trailing
            // ampersand; storing them would add an empty key that later
            // renders as a stray separator.
            if (pair.Length == 0) {
                continue;
            }

            int keyPos = pair.IndexOf('=');
            if (keyPos > 0) {
                query[pair.Substring(0, keyPos)] = pair.Substring(keyPos + 1);
            } else {
                query[pair] = string.Empty;
            }
        }
    }

    /// <summary>
    /// Clears every parsed part back to its empty state and selects the
    /// default port. <see cref="BaseUrl"/> is left untouched.
    /// </summary>
    public void Reset() {
        Scheme = string.Empty;
        Port = 0;
        // Must follow the Port assignment, which clears the flag.
        UseDefaultPort = true;
        HostName = string.Empty;
        User = string.Empty;
        Password = string.Empty;
        Path = string.Empty;
        Query = new NameValueCollection();
        Fragment = string.Empty;
        Relative = false;
    }

    /// <summary>
    /// Copies every part of <paramref name="copyUrl"/> into this instance.
    /// The query collection is duplicated so the two instances do not share it.
    /// </summary>
    /// <param name="copyUrl">The URL to copy from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="copyUrl"/> is null.</exception>
    public void CopyFrom(URL copyUrl) {
        if (copyUrl == null) {
            throw new ArgumentNullException("copyUrl");
        }

        Scheme = copyUrl.Scheme;
        User = copyUrl.User;
        Password = copyUrl.Password;
        HostName = copyUrl.HostName;
        Port = copyUrl.Port;
        // Must follow the Port assignment, which clears the flag.
        UseDefaultPort = copyUrl.UseDefaultPort;
        Path = copyUrl.Path;
        Query = (copyUrl.Query == null)
            ? new NameValueCollection()
            : new NameValueCollection(copyUrl.Query);
        Fragment = copyUrl.Fragment;
        Relative = copyUrl.Relative;
        BaseUrl = copyUrl.BaseUrl;
    }

    /// <summary>Two URLs are equal when they have the same runtime type and identical <see cref="FullURL"/> text.</summary>
    public override bool Equals(object obj) {
        if (obj == null) {
            return false;
        }
        if (obj.GetType() != this.GetType()) {
            return false;
        }
        return FullURL == ((URL) obj).FullURL;
    }

    /// <summary>Hash code of <see cref="FullURL"/>, consistent with <see cref="Equals(object)"/>.</summary>
    public override int GetHashCode() {
        return FullURL.GetHashCode();
    }

    /// <summary>Returns <see cref="FullURL"/>.</summary>
    public override string ToString() {
        return FullURL;
    }

    // Fills the parts from a "scheme://..." URL; leaves them reset when the
    // pattern does not match.
    private void DecodeURL(string value) {
        Match m = urlDecoder.Match(value);
        if (!m.Success) {
            return;
        }

        if (m.Groups[1].Success) {
            Scheme = m.Groups[1].Value;
        }
        if (m.Groups[3].Success) {
            User = m.Groups[3].Value;
        }
        if (m.Groups[5].Success) {
            Password = m.Groups[5].Value;
        }
        if (m.Groups[6].Success) {
            HostName = m.Groups[6].Value;
        }
        if (m.Groups[8].Success) {
            // TryParse so an over-long digit run leaves the default port
            // selected instead of throwing.
            long parsedPort;
            if (long.TryParse(m.Groups[8].Value,
                              System.Globalization.NumberStyles.None,
                              System.Globalization.CultureInfo.InvariantCulture,
                              out parsedPort)) {
                Port = parsedPort;
            }
        }
        if (m.Groups[9].Success) {
            Path = m.Groups[9].Value;
        }
        if (m.Groups[11].Success) {
            QueryString = m.Groups[11].Value;
        }
        if (m.Groups[13].Success) {
            Fragment = m.Groups[13].Value;
        }
    }

    // Fills Scheme, User and HostName from a "mailto:user@host" address.
    private void DecodeMailTo(string value) {
        Match m = mailtoDecoder.Match(value);
        if (!m.Success) {
            return;
        }

        if (m.Groups[1].Success) {
            Scheme = m.Groups[1].Value;
        }
        if (m.Groups[3].Success) {
            User = m.Groups[3].Value;
        }
        if (m.Groups[4].Success) {
            HostName = m.Groups[4].Value;
        }
    }

    /// <summary>
    /// Returns a copy of this URL with its own query collection.
    /// <see cref="BaseUrl"/> is shared by reference.
    /// </summary>
    public object Clone() {
        URL newClone = (URL) this.MemberwiseClone();
        newClone.Query = (Query == null)
            ? new NameValueCollection()
            : new NameValueCollection(Query);
        return newClone;
    }

    /// <summary>
    /// Orders URLs by ordinal comparison of their <see cref="FullURL"/> text,
    /// matching the ordinal equality used by <see cref="Equals(object)"/>.
    /// </summary>
    /// <param name="obj">The object to compare with.</param>
    /// <returns>
    /// Zero for the same instance or identical text, a negative value when
    /// this URL sorts first, a positive value when it sorts last or
    /// <paramref name="obj"/> is null, and -1 when <paramref name="obj"/> is
    /// not a <see cref="URL"/>.
    /// </returns>
    public int CompareTo(object obj) {
        if (object.ReferenceEquals(obj, this)) {
            return 0;
        }
        if (obj == null) {
            return 1;
        }

        URL other = obj as URL;
        if (other == null) {
            return -1;
        }
        return string.CompareOrdinal(FullURL, other.FullURL);
    }
}

[)amien

Related content

16 responses  

  1. Damien Guard on July 10th, 2006

    Yeah sorry there are no usage samples – I did write a few but my blog software started activating the anti-spam on me :D
    To get the current URL:

    URL myURL = new URL(Page.Request.Url.ToString());

    You could remove all the query string with:

    myURL.QueryString = "";

    Alternatively if you just wanted to set one item – say a customerID you would normally have to hope it didn’t exist and add it – taking note to append “&” or “?” depending on if there are other query values. With this class you can just;

    myURL.Query["CustomerID"] = "1";

    Oh, and to write out your hyperlink you’d do something like;

    nextCustomerLink.Href = myURL.FullURL;

    [)amien

  2. John Rummell on July 10th, 2006

    I linked from DotNetKicks. This is cool. How would I use this to grab the URL without the QueryString?

  3. Haacked on July 17th, 2006

    You should consider making it Serializable. That’s one issue I’ve had with the existing Uri class.

  4. Haacked on July 17th, 2006

    I just checked and Uri in .NET 2.0 is serializable.

  5. Damien Guard on July 18th, 2006

    The Uri class in .NET is pretty useless. It doesn’t include username/password properties, the query part is a string instead of a collection of name/value pairs etc.

    [)amien

  6. Fabian on July 19th, 2006

    Great job, thanks!

  7. Mike on March 17th, 2008

    If I had this: http://damieng.com/blog/2006/07/07/URL_parsing_and_manipulation_in_.NET

    How could I get just this part?
    http://damieng.com/blog/2006/07/07

  8. Damien Guard on April 11th, 2008

    @Mike: Well you could but if that’s all you want you may as well just grab the string up to the final / with urlString.Substring(0, urlString.LastIndexOf('/'))

    [)amien

  9. Russ on May 2nd, 2008

    This works well, but any attempts that I make to use an imported query string don’t seem to work. I can build a query string and export it OK, but importing one fails.

    Is there a download of this as a .cs class file?

  10. Russ on May 2nd, 2008

    I found the problem, it’s in the regular expression.. Here’s the new line.

    private const string urlDecodeRegex = @”([^:]+)://(([^:@]+)(:([^@]+))?@)?([^:/?#]+)(:([d]+))?([^?#]+)?(\?([^#]+))?(#(.*))?”;

  11. Damien Guard on May 3rd, 2008

    Thanks for spotting and figuring that out Russ – I must have broken it during the reformatting exercise when I switched to this theme.

    [)amien

  12. MrToes on May 12th, 2008

    It’s also remarkably intolerant of syntax problems. There’s also a problem with the Uri class (in .NET 2.0 Compact Framework at least) where creating it with a string that has leading spaces causes memory corruption. Lovely. We have our own Uri class to fix this (and other) problems. Backward slashes resulting in FormatExceptions is another one that comes to mind. Quite possibly all this is fixed in new shiny .NET versions.

    So, how long is it till V-day? :)

  13. Ziv Rozzbach on October 5th, 2008

    Nice, but it does not do one thing: splitting the host from the domain
    so if I enter reag123.afsda123.ukdomain.co.uk it will return me: ukdomain.co.uk

    Anyone have any idea how can I do it ?

  14. guitarpoet on May 17th, 2009

    Hello, I think the code has a bug and I have rewritten it like this:
    private const string urlDecodeRegex = @”([^:]+)://(([^:@]+)(:([^@]+))?@)?([^:/?#]+)(:([\d]+))?([^?#]+)?(\?([^#]+))?(#(.*))?”;

  15. Michael on April 10th, 2010

    I’ve been collecting versions of different solutions that parse URLS and unfortunately, this one is by far the worst performing solution. Uri for all its weaknesses can process a million encode/decodes in about .4 seconds in my test environment. The second best solution which breaks more out than yours here is about 3 times slower than that but its versatile but I’m looking for something better performing still. But the regex solution above takes about a minute to process a million encode/decodes. That tracks to about 142 times slower. Sorry. Regex is a poor solution for parsing on the serverside.

  16. Damien Guard on April 10th, 2010

    It’s a good solution for general parsing operations even in a server environment where you want to do things like manipulate page parameters etc. (which is what it was originally designed for)

    Any regex based parsing will be less well performing than a hand-coded parser – this applies to any parsing not just this one or url parsing.

    [)amien

Leave your response

  1. (kept private)