07
Jul
2006

URL parsing and manipulation in .NET

While .NET has a URI class, it's not great. It doesn't expose the various parts as properties, and you can't manipulate them either. Many projects I've seen (Subtext included) just try to manipulate URLs as strings, with varying degrees of success.

Here's a C# .NET URL decoder that uses a regular expression I developed for performance based on a VBScript class I developed a while back. Comments and white space have been removed to keep it short.

using System;
using System.Collections.Specialized;
using System.Text.RegularExpressions;

/// <summary>
/// A mutable URL whose parts (scheme, user, password, host, port, path,
/// query and fragment) are exposed as individual read/write properties.
/// Assigning <see cref="FullURL"/> decodes a string into those parts and
/// reading it re-assembles them.
/// </summary>
public class URL : ICloneable, IComparable
{
    // Group 1 = everything before the first ':' (the scheme).
    private const string schemeDecodeRegex = @"([^:]+):";

    // Groups: 1 = scheme (without the colon), 2 = full address, 3 = user, 4 = host.
    private const string mailtoDecodeRegex = @"(mailto):(([^@]+)@(.+))";

    // Groups: 1 = scheme, 3 = user, 5 = password, 6 = host, 8 = port,
    // 9 = path, 11 = query, 13 = fragment (the even groups are wrappers).
    // User and password exclude '/', '?' and '#' so that an '@' appearing
    // later in the path or query is not mistaken for the userinfo separator.
    private const string urlDecodeRegex = @"([^:]+)://(([^:@/?#]+)(:([^@/?#]+))?@)?([^:/?#]+)(:([\d]+))?([^?#]+)?(\?([^#]+))?(#(.*))?";

    // Compiled once and shared instead of being rebuilt on every parse.
    private static readonly Regex schemeRegex = new Regex(schemeDecodeRegex);
    private static readonly Regex mailtoRegex = new Regex(mailtoDecodeRegex, RegexOptions.IgnoreCase);
    private static readonly Regex urlRegex = new Regex(urlDecodeRegex);

    private URL baseUrl;
    private string scheme;
    private long port;
    private bool useDefaultPort;
    private string hostName;
    private string user;
    private string password;
    private string path;
    private NameValueCollection query;
    private string fragment;
    private bool relative;

    /// <summary>Creates an empty URL with every part reset.</summary>
    public URL() {
        Reset();
    }

    /// <summary>Creates a URL by decoding <paramref name="url"/>.</summary>
    /// <param name="url">The text to decode; see <see cref="FullURL"/>.</param>
    public URL(string url) {
        Reset();
        FullURL = url;
    }

    /// <summary>Creates a URL holding a copy of every part of <paramref name="copyUrl"/>.</summary>
    /// <param name="copyUrl">The URL to copy from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="copyUrl"/> is null.</exception>
    public URL(URL copyUrl) {
        Reset();
        CopyFrom(copyUrl);
    }

    /// <summary>The scheme, stored trimmed; null is stored as an empty string.</summary>
    public string Scheme {
        get { return scheme; }
        set { scheme = (value == null) ? string.Empty : value.Trim(); }
    }

    /// <summary>The port number. Setting it also clears <see cref="UseDefaultPort"/>.</summary>
    public long Port {
        get { return port; }
        set {
            port = value;
            useDefaultPort = false;
        }
    }

    /// <summary>When true, <see cref="FullURL"/> omits the port.</summary>
    public bool UseDefaultPort {
        get { return useDefaultPort; }
        set { useDefaultPort = value; }
    }

    /// <summary>The user name; for mailto URLs, the part before the '@'.</summary>
    public string User {
        get { return user; }
        set { user = value; }
    }

    /// <summary>The password that follows the user name.</summary>
    public string Password {
        get { return password; }
        set { password = value; }
    }

    /// <summary>The host name; for mailto URLs, the part after the '@'.</summary>
    public string HostName {
        get { return hostName; }
        set { hostName = value; }
    }

    /// <summary>The path, including its leading slash when decoded from a string.</summary>
    public string Path {
        get { return path; }
        set { path = value; }
    }

    /// <summary>
    /// The query parameters as name/value pairs. Assigning null stores an
    /// empty collection so the other members never see a null query.
    /// </summary>
    public NameValueCollection Query {
        get { return query; }
        set { query = (value == null) ? new NameValueCollection() : value; }
    }

    /// <summary>The fragment, without the leading '#'.</summary>
    public string Fragment {
        get { return fragment; }
        set { fragment = value; }
    }

    /// <summary>
    /// The whole URL as a string. Setting it resets every part and then
    /// decodes the value; text with no "scheme:" prefix (or null) leaves the
    /// URL in its reset state.
    /// </summary>
    public string FullURL {
        get {
            if (string.Equals(Scheme, "mailto", StringComparison.OrdinalIgnoreCase)) {
                return string.Format("{0}:{1}@{2}", Scheme, User, HostName);
            }

            System.Text.StringBuilder builder = new System.Text.StringBuilder();
            if (!Relative) {
                builder.Append(Scheme).Append("://");
                if (!string.IsNullOrEmpty(User)) {
                    builder.Append(User);
                    if (!string.IsNullOrEmpty(Password)) {
                        builder.Append(':').Append(Password);
                    }
                    builder.Append('@');
                }
                builder.Append(HostName);
                if (!UseDefaultPort) {
                    builder.Append(':').Append(Port.ToString(System.Globalization.CultureInfo.InvariantCulture));
                }
            }

            builder.Append(Path);

            string queryString = QueryString;
            if (queryString.Length > 0) {
                builder.Append('?').Append(queryString);
            }
            if (!string.IsNullOrEmpty(Fragment)) {
                builder.Append('#').Append(Fragment);
            }
            return builder.ToString();
        }
        set {
            Reset();
            if (value == null) {
                return;
            }

            Match m = schemeRegex.Match(value);
            if (!m.Success) {
                return;
            }

            if (string.Equals(m.Groups[1].Value, "mailto", StringComparison.OrdinalIgnoreCase)) {
                DecodeMailTo(value);
            } else {
                DecodeURL(value);
            }
        }
    }

    /// <summary>When true, <see cref="FullURL"/> omits scheme, credentials, host and port.</summary>
    public bool Relative {
        get { return relative; }
        set { relative = value; }
    }

    /// <summary>
    /// The query as "key=value" pairs joined by '&amp;' (a key whose value is
    /// empty is written without '='). Setting it replaces the current query.
    /// </summary>
    public string QueryString {
        get {
            System.Text.StringBuilder builder = new System.Text.StringBuilder();
            for (int queryIdx = 0; queryIdx < Query.Count; queryIdx++) {
                if (queryIdx > 0) {
                    builder.Append('&');
                }
                builder.Append(Query.Keys[queryIdx]);

                // A null value is treated like an empty one rather than throwing.
                string itemValue = Query[queryIdx];
                if (!string.IsNullOrEmpty(itemValue)) {
                    builder.Append('=').Append(itemValue);
                }
            }
            return builder.ToString();
        }
        set {
            Query.Clear();
            AppendQueryString(value);
        }
    }

    /// <summary>A base URL associated with this one; it is stored but not used when building <see cref="FullURL"/>.</summary>
    public URL BaseUrl {
        get { return baseUrl; }
        set { baseUrl = value; }
    }

    /// <summary>
    /// Parses '&amp;'-separated pairs and stores each in <see cref="Query"/>,
    /// overwriting keys that already exist. Null or empty input, and empty
    /// pairs such as those produced by "a=1&amp;&amp;b=2", add nothing.
    /// </summary>
    /// <param name="newQueryString">The query text, without a leading '?'.</param>
    public void AppendQueryString(string newQueryString) {
        if (string.IsNullOrEmpty(newQueryString)) {
            return;
        }

        foreach (string pair in newQueryString.Split('&')) {
            if (pair.Length == 0) {
                // Skipping these prevents an empty key that would later emit a stray '&'.
                continue;
            }

            int keyPos = pair.IndexOf('=');
            if (keyPos > 0) {
                query[pair.Substring(0, keyPos)] = pair.Substring(keyPos + 1);
            } else {
                query[pair] = string.Empty;
            }
        }
    }

    /// <summary>
    /// Clears every part back to its empty/default value.
    /// <see cref="BaseUrl"/> is left untouched.
    /// </summary>
    public void Reset() {
        Scheme = string.Empty;
        Port = 0;
        // Must follow the Port assignment, whose setter switches the default port off.
        UseDefaultPort = true;
        HostName = string.Empty;
        User = string.Empty;
        Password = string.Empty;
        Path = string.Empty;
        Query = new NameValueCollection();
        Fragment = string.Empty;
        Relative = false;
    }

    /// <summary>
    /// Copies every part of <paramref name="copyUrl"/> into this instance.
    /// The query collection is duplicated so the two URLs do not share it.
    /// </summary>
    /// <param name="copyUrl">The URL to copy from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="copyUrl"/> is null.</exception>
    public void CopyFrom(URL copyUrl) {
        if (copyUrl == null) {
            throw new ArgumentNullException("copyUrl");
        }

        Scheme = copyUrl.Scheme;
        User = copyUrl.User;
        Password = copyUrl.Password;
        HostName = copyUrl.HostName;
        Port = copyUrl.Port;
        // Assigned after Port because the Port setter clears this flag.
        UseDefaultPort = copyUrl.UseDefaultPort;
        Path = copyUrl.Path;
        Query = new NameValueCollection(copyUrl.Query);
        Fragment = copyUrl.Fragment;
        Relative = copyUrl.Relative;
        BaseUrl = copyUrl.BaseUrl;
    }

    /// <summary>Two URLs are equal when they have the same type and the same <see cref="FullURL"/>.</summary>
    public override bool Equals(object obj) {
        if (obj == null || obj.GetType() != GetType()) {
            return false;
        }
        return FullURL == ((URL) obj).FullURL;
    }

    /// <summary>Hash of <see cref="FullURL"/>, consistent with <see cref="Equals(object)"/>.</summary>
    public override int GetHashCode() {
        return FullURL.GetHashCode();
    }

    /// <summary>Returns <see cref="FullURL"/>.</summary>
    public override string ToString() {
        return FullURL;
    }

    /// <summary>Decodes a "scheme://..." URL into its parts; text that does not match is ignored.</summary>
    private void DecodeURL(string value) {
        Match m = urlRegex.Match(value);
        if (!m.Success) {
            return;
        }

        if (m.Groups[1].Success) {
            Scheme = m.Groups[1].Value;
        }
        if (m.Groups[3].Success) {
            User = m.Groups[3].Value;
        }
        if (m.Groups[5].Success) {
            Password = m.Groups[5].Value;
        }
        if (m.Groups[6].Success) {
            HostName = m.Groups[6].Value;
        }
        if (m.Groups[8].Success) {
            Port = long.Parse(m.Groups[8].Value, System.Globalization.CultureInfo.InvariantCulture);
        }
        if (m.Groups[9].Success) {
            Path = m.Groups[9].Value;
        }
        if (m.Groups[11].Success) {
            QueryString = m.Groups[11].Value;
        }
        if (m.Groups[13].Success) {
            Fragment = m.Groups[13].Value;
        }
    }

    /// <summary>Decodes a "mailto:user@host" address into scheme, user and host.</summary>
    private void DecodeMailTo(string value) {
        Match m = mailtoRegex.Match(value);
        if (!m.Success) {
            return;
        }

        Scheme = m.Groups[1].Value;
        User = m.Groups[3].Value;
        HostName = m.Groups[4].Value;
    }

    /// <summary>Returns a copy of this URL with its own query collection.</summary>
    public object Clone() {
        URL newClone = (URL) this.MemberwiseClone();
        newClone.Query = new NameValueCollection(Query);
        return newClone;
    }

    /// <summary>
    /// Orders URLs by an ordinal comparison of <see cref="FullURL"/>, so a
    /// result of zero agrees with <see cref="Equals(object)"/>. Any instance
    /// sorts after null; an object that is not a URL yields -1.
    /// </summary>
    public int CompareTo(object obj) {
        if (obj == null) {
            return 1;
        }
        if (object.ReferenceEquals(obj, this)) {
            return 0;
        }

        URL other = obj as URL;
        if (other == null) {
            return -1;
        }
        return string.CompareOrdinal(FullURL, other.FullURL);
    }
}

[)amien


15 responses to “URL parsing and manipulation in .NET”


  1. Gravatar 1 Damien Guard Jul 10th, 2006 at 21:07

    Yeah sorry there are no usage samples - I did write a few but my blog software started activating the anti-spam on me :D
    To get the current URL:

    URL myURL = new URL(Page.Request.Url.ToString());

    You could remove all the query string with:

    myURL.QueryString = "";

    Alternatively if you just wanted to set one item - say a customerID you would normally have to hope it didn't exist and add it - taking note to append "&" or "?" depending on if there are other query values. With this class you can just;

    myURL.Query["CustomerID"] = "1";

    Oh, and to write out your hyperlink you'd do something like;

    nextCustomerLink.Href = myURL.FullURL;

    [)amien

  2. Gravatar 2 John Rummell Jul 10th, 2006 at 21:07

    I linked from DotNetKicks. This is cool. How would I use this to grab the URL without the QueryString?

  3. Gravatar 3 John Rummell Jul 10th, 2006 at 21:07

    Excellent. Thank you!

  4. Gravatar 4 Haacked Jul 17th, 2006 at 21:07

    You should consider making it Serializable. That's one issue I've had with the existing Uri class.

  5. Gravatar 5 Haacked Jul 17th, 2006 at 21:07

    I just checked and Uri in .NET 2.0 is serializable.

  6. Gravatar 6 Damien Guard Jul 18th, 2006 at 14:07

    The Uri class in .NET is pretty useless. It doesn't include username/password properties, the query part is a string instead of a collection of name/value pairs etc.

    [)amien

  7. Gravatar 7 Fabian Jul 19th, 2006 at 11:07

    Great job, thanks!

  8. Gravatar 8 Mike Mar 17th, 2008 at 18:03
  9. Gravatar 9 Damien Guard Apr 11th, 2008 at 09:04

    @Mike: Well you could but if that's all you want you may as well just grab the string up to the final / with urlString.Substring(0,urlString.LastIndexOf('/')-1)

    [)amien

  10. Gravatar 10 Russ May 2nd, 2008 at 16:05

    This works well, but any attempts that I make to use an imported query string don't seem to work. I can build a query string and export it OK, but importing one fails.

    Is there a download of this as a .cs class file?

  11. Gravatar 11 Russ May 2nd, 2008 at 17:05

    I found the problem, it's in the regular expression.. Here's the new line.

    private const string urlDecodeRegex = @"([^:]+)://(([^:@]+)(:([^@]+))?@)?([^:/?#]+)(:([d]+))?([^?#]+)?(\?([^#]+))?(#(.*))?";

  12. Gravatar 12 Damien Guard May 3rd, 2008 at 10:05

    Thanks for spotting and figuring that out Ross - I must have broken it during the reformatting exercise when I switched to this theme.

    [)amien

  13. Gravatar 13 MrToes May 12th, 2008 at 16:05

    It's also remarkably intolerant of syntax problems. There's also a problem with the Uri class (in .NET 2.0 Compact Framework at least) where creating it with a string that has leading spaces causes memory corruption. Lovely. We have our own Uri class to fix this (and other) problems. Backward slashes resulting in FormatExceptions is another one that comes to mind. Quite possibly all this is fixed in new shiny .NET versions.

    So, how long is it till V-day? :)

  14. Gravatar 14 Ziv Rozzbach Oct 5th, 2008 at 15:10

    Nice, but it does not do one thing: splitting the host from the domain
    so if I enter reag123.afsda123.ukdomain.co.uk it will return me: ukdomain.co.uk

    Anyone have any idea how can I do it ?

  15. Gravatar 15 guitarpoet May 17th, 2009 at 22:05

    Hello, I think the code has a bug, and I have rewritten it like this:
    private const string urlDecodeRegex = @"([^:]+)://(([^:@]+)(:([^@]+))?@)?([^:/?#]+)(:([\d]+))?([^?#]+)?(\?([^#]+))?(#(.*))?";

Leave a reply




Topics