Monday, June 30, 2008

Regular Expression Parse CSV Code

/// <summary>
/// Helper class that splits a single CSV (comma separated value) string into
/// its column values, mainly by means of regular expressions.
/// </summary>
public class CSVStringTokeniser
{
    // How _expression reads (\x22 is the double quote, \x27 the single quote):
    //
    //   if the next character is a double or single quote    (?(?=[\x22\x27])
    //   {
    //       skip the run of opening quotes, capture everything up to the next
    //       quote as "column", then skip the run of closing quotes
    //           (?:[\x22\x27]+)(?<column>[^\x22\x27]*)(?:[\x22\x27]+)
    //   }
    //   else
    //   {
    //       capture everything up to the next comma / CR / LF as "column"
    //           (?<column>[^,\r\n]*)
    //   }
    //   finally swallow one optional separating comma         (?:,?)
    //
    // Because every part of the unquoted branch may match nothing, the
    // pattern also produces one extra empty match at the very end of the
    // input; Tokenise filters that match out.

    /// <summary>
    /// General purpose column pattern; each match exposes the column text,
    /// without surrounding quotes or the separating comma, in the named
    /// group "column". This is the pattern used by <see cref="Tokenise"/>.
    /// </summary>
    public static readonly string _expression =
        @"((?(?=[\x22\x27])(?:[\x22\x27]+)(?<column>[^\x22\x27]*)(?:[\x22\x27]+)|(?<column>[^,\r\n]*))(?:,?))";

    /// <summary>
    /// Simplest pattern: a run of characters that are neither a comma nor a
    /// quote. It cannot handle a comma inside a quoted column,
    /// e.g. "abc,efg",hijk. Not used by <see cref="Tokenise"/>.
    /// </summary>
    public static readonly string _expressionSimple =
        @"[^,\x22\x27]*";

    /// <summary>
    /// Experimental variant of <see cref="_expression"/>. According to the
    /// original author's notes it does not produce the extra empty match at
    /// the end of the input, but it cannot handle consecutive empty columns,
    /// e.g. abc,,,efg. Not used by <see cref="Tokenise"/>.
    /// </summary>
    public static readonly string _expressionTrial =
        @"(?(?=[\x22\x27])(?:[\x22\x27]+)(?<column>[^\x22\x27]*)(?:[\x22\x27]+)(?:,?)|((?<column>[^,\r\n]+)(?:,?)|(?<column>\W*)(?:,)))";

    // Built once and reused by every Tokenise call instead of re-parsing the
    // pattern each time. It must be declared after _expression because static
    // field initialisers run in textual order.
    private static readonly Regex ColumnRegex = new Regex(_expression);

    /// <summary>
    /// Tokenises a CSV (comma separated value) string.
    /// Known limitation: quoted empty columns, e.g. "abc","","","efg", are
    /// not tokenised correctly.
    /// </summary>
    /// <param name="inputStr">CSV string; must not be null.</param>
    /// <returns>
    /// List of the column values with quotes and separating commas removed.
    /// An empty input string yields an empty list.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="inputStr"/> is null.
    /// </exception>
    public static List<string> Tokenise(string inputStr)
    {
        if (inputStr == null)
        {
            throw new System.ArgumentNullException("inputStr");
        }

        List<string> columns = new List<string>();

        // Decides whether the empty match at the end of the input is a real
        // (empty) last column. Checking Length first keeps the empty string
        // from being indexed out of range.
        bool endsWithComma = inputStr.Length > 0 && inputStr[inputStr.Length - 1] == ',';

        foreach (Match match in ColumnRegex.Matches(inputStr))
        {
            foreach (Capture capture in match.Groups["column"].Captures)
            {
                if (capture.Index < inputStr.Length)
                {
                    columns.Add(capture.Value);
                }
                else if (endsWithComma)
                {
                    // The capture sits at the very end of the input: keep it
                    // only when a trailing comma announces an empty column.
                    columns.Add(string.Empty);
                }
            }
        }

        return columns;
    }
}

1 comment:

Vineet Singla said...

Hi there, your post on parsing csv is a bit old so not sure if you still use it. I fixed the quoted empty column issue with your pattern. Here's the final regex-

((?(?=[\x22\x27])(?:[\x22\x27]+?)(?<column>[^\x22\x27]*)(?:[\x22\x27]+?)|(?<column>[^,\r\n]*))(?:,?))