What is the fastest way to replace extra white spaces to one white space?
e.g.
from
foo bar
to
foo bar
What is the fastest way to replace extra white spaces to one white space?
e.g.
from
foo bar
to
foo bar
The fastest way? Iterate over the string and build a second copy in a StringBuilder
character by character, only copying one space for each group of spaces.
The easier to type Replace
variants will create a bucket load of extra strings (or waste time building the regex DFA).
Edit with comparison results:
Using http://ideone.com/h6pw3, with n=50 (had to reduce it on ideone because it took so long they had to kill my process), I get:
Regex: 7771ms.
Stringbuilder: 894ms.
Which is indeed as expected, Regex
is horribly inefficient for something this simple.
You can use a regex:
static readonly Regex trimmer = new Regex(@"\s\s+");
s = trimmer.Replace(s, " ");
For added performance, pass RegexOptions.Compiled
.
A bit late, but I have done some benchmarking to get the fastest way to remove extra whitespaces. If there are any faster answers, I would love to add them.
Results:
Code:
public class RemoveExtraWhitespaces
{
public static string WithRegex(string text)
{
return Regex.Replace(text, @"\s+", " ");
}
public static string WithRegexCompiled(Regex compiledRegex, string text)
{
return compiledRegex.Replace(text, " ");
}
public static string NormalizeWhiteSpace(string input)
{
if (string.IsNullOrEmpty(input))
return string.Empty;
int current = 0;
char[] output = new char[input.Length];
bool skipped = false;
foreach (char c in input.ToCharArray())
{
if (char.IsWhiteSpace(c))
{
if (!skipped)
{
if (current > 0)
output[current++] = ' ';
skipped = true;
}
}
else
{
skipped = false;
output[current++] = c;
}
}
return new string(output, 0, current);
}
public static string NormalizeWhiteSpaceForLoop(string input)
{
int len = input.Length,
index = 0,
i = 0;
var src = input.ToCharArray();
bool skip = false;
char ch;
for (; i < len; i++)
{
ch = src[i];
switch (ch)
{
case '\u0020':
case '\u00A0':
case '\u1680':
case '\u2000':
case '\u2001':
case '\u2002':
case '\u2003':
case '\u2004':
case '\u2005':
case '\u2006':
case '\u2007':
case '\u2008':
case '\u2009':
case '\u200A':
case '\u202F':
case '\u205F':
case '\u3000':
case '\u2028':
case '\u2029':
case '\u0009':
case '\u000A':
case '\u000B':
case '\u000C':
case '\u000D':
case '\u0085':
if (skip) continue;
src[index++] = ch;
skip = true;
continue;
default:
skip = false;
src[index++] = ch;
continue;
}
}
return new string(src, 0, index);
}
}
Tests:
[TestFixture]
public class RemoveExtraWhitespacesTest
{
private const string _text = "foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo ";
private const string _expected = "foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo foo bar foobar moo ";
private const int _iterations = 10000;
[Test]
public void Regex()
{
var result = TimeAction("Regex", () => RemoveExtraWhitespaces.WithRegex(_text));
Assert.AreEqual(_expected, result);
}
[Test]
public void RegexCompiled()
{
var compiledRegex = new Regex(@"\s+", RegexOptions.Compiled);
var result = TimeAction("RegexCompiled", () => RemoveExtraWhitespaces.WithRegexCompiled(compiledRegex, _text));
Assert.AreEqual(_expected, result);
}
[Test]
public void NormalizeWhiteSpace()
{
var result = TimeAction("NormalizeWhiteSpace", () => RemoveExtraWhitespaces.NormalizeWhiteSpace(_text));
Assert.AreEqual(_expected, result);
}
[Test]
public void NormalizeWhiteSpaceForLoop()
{
var result = TimeAction("NormalizeWhiteSpaceForLoop", () => RemoveExtraWhitespaces.NormalizeWhiteSpaceForLoop(_text));
Assert.AreEqual(_expected, result);
}
public string TimeAction(string name, Func<string> func)
{
var timer = Stopwatch.StartNew();
string result = string.Empty; ;
for (int i = 0; i < _iterations; i++)
{
result = func();
}
timer.Stop();
Console.WriteLine(string.Format("{0}: {1} ms", name, timer.ElapsedMilliseconds));
return result;
}
}
I use below methods - they handle all whitespace chars not only spaces, trim both leading and trailing whitespaces, remove extra whitespaces, and all whitespaces are replaced to space char (so we have uniform space separator). And these methods are fast.
public static String CompactWhitespaces( String s )
{
StringBuilder sb = new StringBuilder( s );
CompactWhitespaces( sb );
return sb.ToString();
}
public static void CompactWhitespaces( StringBuilder sb )
{
if( sb.Length == 0 )
return;
// set [start] to first not-whitespace char or to sb.Length
int start = 0;
while( start < sb.Length )
{
if( Char.IsWhiteSpace( sb[ start ] ) )
start++;
else
break;
}
// if [sb] has only whitespaces, then return empty string
if( start == sb.Length )
{
sb.Length = 0;
return;
}
// set [end] to last not-whitespace char
int end = sb.Length - 1;
while( end >= 0 )
{
if( Char.IsWhiteSpace( sb[ end ] ) )
end--;
else
break;
}
// compact string
int dest = 0;
bool previousIsWhitespace = false;
for( int i = start; i <= end; i++ )
{
if( Char.IsWhiteSpace( sb[ i ] ) )
{
if( !previousIsWhitespace )
{
previousIsWhitespace = true;
sb[ dest ] = ' ';
dest++;
}
}
else
{
previousIsWhitespace = false;
sb[ dest ] = sb[ i ];
dest++;
}
}
sb.Length = dest;
}
string text = "foo bar";
text = Regex.Replace(text, @"\s+", " ");
// text = "foo bar"
This solution works with spaces, tabs, and newline. If you want just spaces, replace '\s' with ' '.
string q = " Hello how are you doing?";
string a = String.Join(" ", q.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries));
I needed one of these for larger strings and came up with the routine below.
Any consecutive white-space (including tabs, newlines) is replaced with whatever is in normalizeTo
.
Leading/trailing white-space is removed.
It's around 8 times faster than a RegEx with my 5k->5mil char strings.
internal static string NormalizeWhiteSpace(string input, char normalizeTo = ' ')
{
if (string.IsNullOrEmpty(input))
return string.Empty;
int current = 0;
char[] output = new char[input.Length];
bool skipped = false;
foreach (char c in input.ToCharArray())
{
if (char.IsWhiteSpace(c))
{
if (!skipped)
{
if (current > 0)
output[current++] = normalizeTo;
skipped = true;
}
}
else
{
skipped = false;
output[current++] = c;
}
}
return new string(output, 0, skipped ? current - 1 : current);
}
string yourWord = "beep boop baap beep boop baap beep";
yourWord = yourWord .Replace(" ", " |").Replace("| ", "").Replace("|", "");
I've tried using StringBuilder to:
Here's the best balance of performance & readability I've found (using 100,000 iteration timing runs). Sometimes this tests faster than a less-legible version, at most 5% slower. On my small test string, regex takes 4.24x as much time.
public static string RemoveExtraWhitespace(string str)
{
var sb = new StringBuilder();
var prevIsWhitespace = false;
foreach (var ch in str)
{
var isWhitespace = char.IsWhiteSpace(ch);
if (prevIsWhitespace && isWhitespace)
{
continue;
}
sb.Append(ch);
prevIsWhitespace = isWhitespace;
}
return sb.ToString();
}
It's not fast, but if simplicity helps, this works:
while (text.Contains(" ")) text=text.Replace(" ", " ");
try this:
System.Text.RegularExpressions.Regex.Replace(input, @"\s+", " ");
A few requirements are not clear in this question which deserve some thought.
This is a very efficient version which replaces all white space with a single space and removes any leading and trailing white space prior to the for loop.
public static string WhiteSpaceToSingleSpaces(string input)
{
if (input.Length < 2)
return input;
StringBuilder sb = new StringBuilder();
input = input.Trim();
char lastChar = input[0];
bool lastCharWhiteSpace = false;
for (int i = 1; i < input.Length; i++)
{
bool whiteSpace = char.IsWhiteSpace(input[i]);
//Skip duplicate whitespace characters
if (whiteSpace && lastCharWhiteSpace)
continue;
//Replace all whitespace with a single space.
if (whiteSpace)
sb.Append(' ');
else
sb.Append(input[i]);
//Keep track of the last character's whitespace status
lastCharWhiteSpace = whiteSpace;
}
return sb.ToString();
}
This piece of code works good. I have not measure the performance.
string text = " hello - world, here we go !!! a bc ";
string.Join(" ", text.Split().Where(x => x != ""));
// Output
// "hello - world, here we go !!! a bc"
you could use indexOf to first grab where the whitespace sequences start, then use replace method to change the white space to "". From there, you can use the index you grabbed and place one whitespace character in that spot.
This is funny, but on my PC the below method is just as fast as Sergey Povalyaev's StringBulder approach - (~282ms for 1000 reps, 10k src strings). Not sure about memory usage though.
string RemoveExtraWhiteSpace(string src, char[] wsChars){
return string.Join(" ",src.Split(wsChars, StringSplitOptions.RemoveEmptyEntries));
}
Obviously it works okay with any chars - not just spaces.
Though this is not what the OP asked for - but if what you really need is to replace specific consecutive characters in a string with only one instance you can use this relatively efficient method:
string RemoveDuplicateChars(string src, char[] dupes){
var sd = (char[])dupes.Clone();
Array.Sort(sd);
var res = new StringBuilder(src.Length);
for(int i = 0; i<src.Length; i++){
if( i==0 || src[i]!=src[i-1] || Array.BinarySearch(sd,src[i])<0){
res.Append(src[i]);
}
}
return res.ToString();
}
public string GetCorrectString(string IncorrectString)
{
string[] strarray = IncorrectString.Split(' ');
var sb = new StringBuilder();
foreach (var str in strarray)
{
if (str != string.Empty)
{
sb.Append(str).Append(' ');
}
}
return sb.ToString().Trim();
}
I just whipped this up, haven't tested it yet though. But I felt this was elegant, and avoids regex:
/// <summary>
/// Removes extra white space.
/// </summary>
/// <param name="s">
/// The string
/// </param>
/// <returns>
/// The string, with only single white-space groupings.
/// </returns>
public static string RemoveExtraWhiteSpace(this string s)
{
if (s.Length == 0)
{
return string.Empty;
}
var stringBuilder = new StringBuilder();
var whiteSpaceCount = 0;
foreach (var character in s)
{
if (char.IsWhiteSpace(character))
{
whiteSpaceCount++;
}
else
{
whiteSpaceCount = 0;
}
if (whiteSpaceCount > 1)
{
continue;
}
stringBuilder.Append(character);
}
return stringBuilder.ToString();
}
Am I missing something here? I came up with this:
// Input: "HELLO BEAUTIFUL WORLD!"
private string NormalizeWhitespace(string inputStr)
{
// First split the string on the spaces but exclude the spaces themselves
// Using the input string the length of the array will be 3. If the spaces
// were not filtered out they would be included in the array
var splitParts = inputStr.Split(' ').Where(x => x != "").ToArray();
// Now iterate over the parts in the array and add them to the return
// string. If the current part is not the last part, add a space after.
for (int i = 0; i < splitParts.Count(); i++)
{
retVal += splitParts[i];
if (i != splitParts.Count() - 1)
{
retVal += " ";
}
}
return retVal;
}
// Would return "HELLO BEAUTIFUL WORLD!"
I know I am creating a second string here to return it as well as creating the splitParts array. Just figured this is pretty straight forward. Maybe I am not taking into account some of the potential scenarios.
I know this is really old, but the easiest way to compact whitespace (replace any recurring whitespace character with a single "space" character) is as follows:
public static string CompactWhitespace(string astring)
{
if (!string.IsNullOrEmpty(astring))
{
bool found = false;
StringBuilder buff = new StringBuilder();
foreach (char chr in astring.Trim())
{
if (char.IsWhiteSpace(chr))
{
if (found)
{
continue;
}
found = true;
buff.Append(' ');
}
else
{
if (found)
{
found = false;
}
buff.Append(chr);
}
}
return buff.ToString();
}
return string.Empty;
}
public static string RemoveExtraSpaces(string input)
{
input = input.Trim();
string output = "";
bool WasLastCharSpace = false;
for (int i = 0; i < input.Length; i++)
{
if (input[i] == ' ' && WasLastCharSpace)
continue;
WasLastCharSpace = input[i] == ' ';
output += input[i];
}
return output;
}
For those who just want to copy-pase and go on:
private string RemoveExcessiveWhitespace(string value)
{
if (value == null) { return null; }
var builder = new StringBuilder();
var ignoreWhitespace = false;
foreach (var c in value)
{
if (!ignoreWhitespace || c != ' ')
{
builder.Append(c);
}
ignoreWhitespace = c == ' ';
}
return builder.ToString();
}
There is no need for complex code! Here is a simple code that will remove any duplicates:
public static String RemoveCharOccurence(String s, char[] remove)
{
String s1 = s;
foreach(char c in remove)
{
s1 = RemoveCharOccurence(s1, c);
}
return s1;
}
public static String RemoveCharOccurence(String s, char remove)
{
StringBuilder sb = new StringBuilder(s.Length);
Boolean removeNextIfMatch = false;
foreach(char c in s)
{
if(c == remove)
{
if(removeNextIfMatch)
continue;
else
removeNextIfMatch = true;
}
else
removeNextIfMatch = false;
sb.Append(c);
}
return sb.ToString();
}
It's very simple, just use the .Replace()
method:
string words = "Hello world!";
words = words.Replace("\\s+", " ");
Output >>> "Hello world!"