Splitting Camel Case with RegEx
Phil posted some code to Split Pascal/Camel Cased Strings a few days ago. We had an offline discussion on doing this via RegEx.
I like the RegEx approach since it's only one line of code:
string output = System.Text.RegularExpressions.Regex.Replace(
input,
"([A-Z])",
" $1",
System.Text.RegularExpressions.RegexOptions.Compiled).Trim();
This matches all capital letters, replaces them with a space and the letter we found ($1), then trims the result to remove the initial space if there was a capital letter at the beginning.
So, which would you use?
Arguments for Phil's C# approach:
- Easier for other programmers to read - not everyone knows RegEx
- Faster (see comparison below)
- All compiled code, so errors are more likely to be caught in development
Arguments for my RegEx approach:
- Simpler (in my opinion)
- RegEx is a string, so it can be put in a configuration file
So, let's compare performance. Now, this is mostly academic since this kind of function would likely be called fewer than 25 times, but it's still worth a look. Here are the results for the sample string "SampleSplitText":
Approach | Repetitions | Time (seconds) |
RegEx Replace | 1000 | .0312500 |
RegEx Replace | 100000 | .3125000 |
RegEx Replace | 10000000 | 29.1562500 |
Code Approach | 1000 | 0 (not measurable) |
Code Approach | 100000 | .0156250 |
Code Approach | 10000000 | 1.6562500 |
RegEx Delegate Replace | 1000 | 0 (not measurable) |
RegEx Delegate Replace | 100000 | .0937500 |
RegEx Delegate Replace | 10000000 | 7.5000000 |
The only reason for calling this out is to show the exceptionally slow performance of the RegEx replace method for high iterations. For under a thousand iterations, I'd definitely go with the RegEx replace. For high repetitions, I'd consider using a RegEx replace with a MatchEvaluator delegate (see the code below). For my very simple test, it was just about as fast for anything under 100000 repetitions.
(updated - fixed a code error with delegate method)
using System;
using System.Collections;
using System.Collections.Specialized;
using System.Diagnostics;
using System.Text.RegularExpressions;
/// <summary>
/// Interactive console benchmark that splits a Pascal/camel cased string into
/// space-separated words using three approaches (inline <c>Regex.Replace</c>,
/// a reused <c>Regex</c> with a <c>MatchEvaluator</c>, and hand-written code)
/// and reports how long each one takes for a given number of iterations.
/// </summary>
public class SplitTest
{
    /// <summary>Text that is split when the user just presses Enter.</summary>
    private const string DefaultInput = "SampleSplitText";

    /// <summary>Signature shared by the three benchmarked implementations.</summary>
    private delegate string SplitBenchmark(string input, int iterations);

    /// <summary>
    /// Prompts for the text and the iteration count, runs all three approaches,
    /// and repeats until the user enters 0, a negative number, or something
    /// that is not an integer.
    /// </summary>
    public static void Main()
    {
        for(;;)
        {
            Console.WriteLine("Enter CamelCase text to split (defaults to " + DefaultInput + "):");
            string input = Console.ReadLine();
            // ReadLine returns null once standard input is closed; treat that
            // like an empty line so the regex calls never receive null.
            if(string.IsNullOrEmpty(input))
            {
                input = DefaultInput;
            }

            Console.WriteLine("Enter number of operations ( enter 0 to quit):");
            int iterations;
            if(!int.TryParse(Console.ReadLine(), out iterations))
            {
                Console.WriteLine("Exiting");
                break;
            }
            // A negative count cannot be benchmarked, so it quits just like 0.
            if(iterations <= 0)
            {
                break;
            }

            RunTimed("Inline RegEx", new SplitBenchmark(InlineRegExTest), input, iterations);
            RunTimed("RegEx / MatchEvaluator", new SplitBenchmark(DelegateRegExTest), input, iterations);
            RunTimed("Code", new SplitBenchmark(CodeTest), input, iterations);

            // Wait for Enter before starting the next round.
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Runs one benchmark, then prints its last output and the elapsed time.
    /// </summary>
    /// <param name="label">Name of the approach, used in the printed messages.</param>
    /// <param name="benchmark">The implementation to time.</param>
    /// <param name="input">Text to split.</param>
    /// <param name="iterations">Number of times the split is repeated.</param>
    private static void RunTimed(string label, SplitBenchmark benchmark, string input, int iterations)
    {
        // Only the benchmark call sits between Start and Stop, so console
        // output is not counted in the reported time.
        Stopwatch timer = Stopwatch.StartNew();
        string output = benchmark(input, iterations);
        timer.Stop();

        Console.WriteLine(string.Format("Output from {0} approach: {1}", label, output));
        Console.WriteLine(string.Format("{0} approach took {1} seconds for {2} iterations.", label, timer.Elapsed.TotalSeconds, iterations));
    }

    /// <summary>
    /// Splits <paramref name="input"/> by calling the static
    /// <c>Regex.Replace</c> on every iteration.
    /// </summary>
    /// <param name="input">Text to split.</param>
    /// <param name="iterations">Number of times the split is repeated.</param>
    /// <returns>The result of the last split, or "Failed" if the loop never ran.</returns>
    private static string InlineRegExTest(string input, int iterations)
    {
        string output = "Failed";
        for(int i = 0; i < iterations; i++)
        {
            // The pattern is passed as a string each time on purpose: this is
            // the "one line of code" variant being measured.
            output = Regex.Replace(input, "([A-Z])", " $1", RegexOptions.Compiled).Trim();
        }
        return output;
    }

    /// <summary>
    /// Splits <paramref name="input"/> with a single <c>Regex</c> instance and
    /// a <c>MatchEvaluator</c> that builds each replacement.
    /// </summary>
    /// <param name="input">Text to split.</param>
    /// <param name="iterations">Number of times the split is repeated.</param>
    /// <returns>The result of the last split, or "Failed" if the loop never ran.</returns>
    private static string DelegateRegExTest(string input, int iterations)
    {
        Regex reg = new Regex("(?<Word>[A-Z])", RegexOptions.Compiled);
        // The evaluator does not change between iterations, so create it once.
        MatchEvaluator evaluator = new MatchEvaluator(FormatWord);
        string output = "Failed";
        for(int i = 0; i < iterations; i++)
        {
            // Trim removes the space inserted before a leading capital, so the
            // result matches what InlineRegExTest produces.
            output = reg.Replace(input, evaluator).Trim();
        }
        return output;
    }

    /// <summary>
    /// Replacement callback: prefixes the captured capital letter with a space.
    /// </summary>
    /// <param name="m">The match produced by the regular expression.</param>
    /// <returns>
    /// A space followed by the "Word" group, or the unchanged match text when
    /// that group did not participate in the match.
    /// </returns>
    private static string FormatWord(Match m)
    {
        Group word = m.Groups["Word"];
        if(word.Success)
        {
            return " " + word.Value;
        }
        return m.Value;
    }

    /// <summary>
    /// Splits <paramref name="input"/> with <see cref="SplitUpperCaseToString"/>.
    /// </summary>
    /// <param name="input">Text to split.</param>
    /// <param name="iterations">Number of times the split is repeated.</param>
    /// <returns>The result of the last split, or "Failed" if the loop never ran.</returns>
    private static string CodeTest(string input, int iterations)
    {
        string output = "Failed";
        for(int i = 0; i < iterations; i++)
        {
            output = SplitUpperCaseToString(input);
        }
        return output;
    }

    /// <summary>
    /// Parses a camel cased or pascal cased string and returns a new
    /// string with spaces between the words in the string.
    /// </summary>
    /// <example>
    /// The string "PascalCasing" will return the string "Pascal Casing".
    /// </example>
    /// <param name="source">The text to split; may be null or empty.</param>
    /// <returns>
    /// The words of <paramref name="source"/> joined by single spaces; an
    /// empty string when <paramref name="source"/> is null or empty.
    /// </returns>
    public static string SplitUpperCaseToString(string source)
    {
        return string.Join(" ", SplitUpperCase(source));
    }

    /// <summary>
    /// Parses a camel cased or pascal cased string and returns an array
    /// of the words within the string.
    /// </summary>
    /// <example>
    /// The string "PascalCasing" will return an array with two
    /// elements, "Pascal" and "Casing".
    /// </example>
    /// <param name="source">The text to split; may be null or empty.</param>
    /// <returns>
    /// The words in order of appearance. A null <paramref name="source"/>
    /// yields an empty array; an empty string yields an array holding one
    /// empty string.
    /// </returns>
    public static string[] SplitUpperCase(string source)
    {
        if(source == null)
        {
            return new string[] {};
        }
        if(source.Length == 0)
        {
            return new string[] {""};
        }

        StringCollection words = new StringCollection();
        int wordStartIndex = 0;
        // Start at 1: the first character always belongs to the first word,
        // whatever its case.
        for(int i = 1; i < source.Length; i++)
        {
            if(char.IsUpper(source[i]))
            {
                // Everything since the previous boundary is one word.
                words.Add(source.Substring(wordStartIndex, i - wordStartIndex));
                wordStartIndex = i;
            }
        }
        // The text after the last boundary is the final word.
        words.Add(source.Substring(wordStartIndex));

        string[] wordArray = new string[words.Count];
        words.CopyTo(wordArray, 0);
        return wordArray;
    }
}