Fix "Sort Lines as Integer" not considering negative number

The current implementation is a variant of Natural Sort, which takes in account of negative numbers.

Fix #11023, fix #2025, close #11481
This commit is contained in:
wiseyestudio00 2022-04-06 02:07:47 -04:00 committed by Don Ho
parent a16f08468e
commit 721f994df8
2 changed files with 264 additions and 45 deletions

View File

@ -161,18 +161,13 @@ public:
} }
}; };
// Treat consecutive numerals as one number class IntegerSorter : public ISorter
// Otherwise it is a lexicographic sort
class NaturalSorter : public ISorter
{ {
public: public:
NaturalSorter(bool isDescending, size_t fromColumn, size_t toColumn) : ISorter(isDescending, fromColumn, toColumn) { }; IntegerSorter(bool isDescending, size_t fromColumn, size_t toColumn) : ISorter(isDescending, fromColumn, toColumn) { };
std::vector<generic_string> sort(std::vector<generic_string> lines) override std::vector<generic_string> sort(std::vector<generic_string> lines) override
{ {
// Note that both branches here are equivalent in the sense that they always give the same answer.
// However, if we are *not* sorting specific columns, then we get a 40% speed improvement by not calling
// getSortKey() so many times.
if (isSortingSpecificColumns()) if (isSortingSpecificColumns())
{ {
std::sort(lines.begin(), lines.end(), [this](generic_string aIn, generic_string bIn) std::sort(lines.begin(), lines.end(), [this](generic_string aIn, generic_string bIn)
@ -181,43 +176,152 @@ public:
generic_string b = getSortKey(bIn); generic_string b = getSortKey(bIn);
long long compareResult = 0; long long compareResult = 0;
size_t i = 0; size_t aNumIndex = 0;
size_t bNumIndex = 0;
while (compareResult == 0) while (compareResult == 0)
{ {
if (i >= a.length() || i >= b.length()) if (aNumIndex >= a.length() || bNumIndex >= b.length())
{ {
compareResult = a.compare(min(i, a.length()), generic_string::npos, b, min(i, b.length()), generic_string::npos); compareResult = a.compare(min(aNumIndex, a.length()), generic_string::npos, b, min(bNumIndex, b.length()), generic_string::npos);
break; break;
} }
bool aChunkIsNum = a[i] >= L'0' && a[i] <= L'9'; bool aChunkIsNum = a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9';
bool bChunkIsNum = b[i] >= L'0' && b[i] <= L'9'; bool bChunkIsNum = b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9';
int aNumSign = 1;
// Could be start of negative number
if (!aChunkIsNum && (aNumIndex + 1) < a.length())
{
aChunkIsNum = (a[aNumIndex] == L'-' && (a[aNumIndex + 1] >= L'0' && a[aNumIndex + 1] <= L'9'));
aNumSign = -1;
}
int bNumSign = 1;
if (!bChunkIsNum && (bNumIndex + 1) < b.length())
{
bChunkIsNum = (b[bNumIndex] == L'-' && (b[bNumIndex + 1] >= L'0' && b[bNumIndex + 1] <= L'9'));
bNumSign = -1;
}
// One is number and one is string // One is number and one is string
if (aChunkIsNum != bChunkIsNum) if (aChunkIsNum != bChunkIsNum)
{ {
compareResult = a[i] - b[i]; compareResult = a[aNumIndex] - b[bNumIndex];
// No need to update i; compareResult != 0
// compareResult isn't necessarily 0
// consider this case: "0-0", "0-"
// "-0" is considered a number, but "-" isn't
// but we are comparing two "-', which is the same
aNumIndex++;
bNumIndex++;
} }
// Both are numbers // Both are numbers
else if (aChunkIsNum) else if (aChunkIsNum)
{ {
size_t delta = 0; // if the sign is differemt, just return the compareResult
if (aNumSign != bNumSign)
{
if (aNumSign == 1)
{
compareResult = 1;
}
else
{
compareResult = -1;
}
// No need to update anything; compareResult != 0; will break out while loop
}
else
{
if (aNumSign == -1)
{
aNumIndex++;
bNumIndex++;
}
// stoll crashes if number exceeds the limit for unsigned long long size_t aNumEnd = a.find_first_not_of(L"1234567890", aNumIndex);
// Maximum value for a variable of type unsigned long long | 18446744073709551615 if (aNumEnd == generic_string::npos)
// So take the max length 18 to convert the number {
const size_t maxLen = 18; aNumEnd = a.length();
compareResult = std::stoll(a.substr(i, maxLen)) - std::stoll(b.substr(i, maxLen), &delta); }
i += delta;
size_t bNumEnd = b.find_first_not_of(L"1234567890", bNumIndex);
if (bNumEnd == generic_string::npos)
{
bNumEnd = b.length();
}
int aZeroNum = 0;
while (aNumIndex < a.length() && a[aNumIndex] == '0')
{
aZeroNum++;
aNumIndex++;
}
int bZeroNum = 0;
while (bNumIndex < b.length() && b[bNumIndex] == '0')
{
bZeroNum++;
bNumIndex++;
}
size_t aNumLength = aNumEnd - aNumIndex;
size_t bNumLength = bNumEnd - bNumIndex;
// aNum is longer than bNum, must be larger (smaller if negative)
if (aNumLength > bNumLength)
{
compareResult = 1 * aNumSign;
// No need to update anything; compareResult != 0; will break out while loop
}
// bNum is longer than aNum, must be larger (smaller if negative)
else if (aNumLength < bNumLength)
{
compareResult = -1 * aNumSign;
// No need to update anything; compareResult != 0; will break out while loop
}
else
{
// the lengths of the numbers are equal
// compare the two number. However, we can not use std::stoll
// because the number strings can be every large, well over the maximum long long value
// thus, we compare each digit one by one
while (compareResult == 0
&& aNumIndex < a.length()
&& (a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9')
&& bNumIndex < b.length()
&& (b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9'))
{
compareResult = (a[aNumIndex] - b[bNumIndex]) * aNumSign;
aNumIndex++;
bNumIndex++;
}
if (compareResult == 0)
{
compareResult = bZeroNum - aZeroNum;
}
}
}
} }
// Both are strings // Both are strings
else else
{ {
size_t aChunkEnd = a.find_first_of(L"1234567890", i); if (a[aNumIndex] == L'-')
size_t bChunkEnd = b.find_first_of(L"1234567890", i); {
compareResult = a.compare(i, aChunkEnd - i, b, i, bChunkEnd - i); aNumIndex++;
i = aChunkEnd; }
if (b[bNumIndex] == L'-')
{
bNumIndex++;
}
size_t aChunkEnd = a.find_first_of(L"1234567890-", aNumIndex);
size_t bChunkEnd = b.find_first_of(L"1234567890-", bNumIndex);
compareResult = a.compare(aNumIndex, aChunkEnd - aNumIndex, b, bNumIndex, bChunkEnd - bNumIndex);
aNumIndex = aChunkEnd;
bNumIndex = bChunkEnd;
} }
} }
@ -233,46 +337,159 @@ public:
} }
else else
{ {
std::sort(lines.begin(), lines.end(), [this](generic_string a, generic_string b) std::sort(lines.begin(), lines.end(), [this](generic_string aIn, generic_string bIn)
{ {
generic_string a = aIn;
generic_string b = bIn;
long long compareResult = 0; long long compareResult = 0;
size_t i = 0; size_t aNumIndex = 0;
size_t bNumIndex = 0;
while (compareResult == 0) while (compareResult == 0)
{ {
if (i >= a.length() || i >= b.length()) if (aNumIndex >= a.length() || bNumIndex >= b.length())
{ {
compareResult = a.compare(min(i,a.length()), generic_string::npos, b, min(i,b.length()), generic_string::npos); compareResult = a.compare(min(aNumIndex, a.length()), generic_string::npos, b, min(bNumIndex, b.length()), generic_string::npos);
break; break;
} }
bool aChunkIsNum = a[i] >= L'0' && a[i] <= L'9'; bool aChunkIsNum = a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9';
bool bChunkIsNum = b[i] >= L'0' && b[i] <= L'9'; bool bChunkIsNum = b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9';
int aNumSign = 1;
// Could be start of negative number
if (!aChunkIsNum && (aNumIndex + 1) < a.length())
{
aChunkIsNum = (a[aNumIndex] == L'-' && (a[aNumIndex + 1] >= L'0' && a[aNumIndex + 1] <= L'9'));
aNumSign = -1;
}
int bNumSign = 1;
if (!bChunkIsNum && (bNumIndex + 1) < b.length())
{
bChunkIsNum = (b[bNumIndex] == L'-' && (b[bNumIndex + 1] >= L'0' && b[bNumIndex + 1] <= L'9'));
bNumSign = -1;
}
// One is number and one is string // One is number and one is string
if (aChunkIsNum != bChunkIsNum) if (aChunkIsNum != bChunkIsNum)
{ {
compareResult = a[i] - b[i]; compareResult = a[aNumIndex] - b[bNumIndex];
// No need to update i; compareResult != 0
// compareResult isn't necessarily 0
// consider this case: "0-0", "0-"
// "-0" is considered a number, but "-" isn't
// but we are comparing two "-', which is the same
// increment the indexes for this case and continue the loop
aNumIndex++;
bNumIndex++;
} }
// Both are numbers // Both are numbers
else if (aChunkIsNum) else if (aChunkIsNum)
{ {
size_t delta = 0; // if the sign is differemt, just return the compareResult
if (aNumSign != bNumSign)
{
if (aNumSign == 1)
{
compareResult = 1;
}
else
{
compareResult = -1;
}
// No need to update anything; compareResult != 0; will break out while loop
}
else
{
if (aNumSign == -1)
{
aNumIndex++;
bNumIndex++;
}
// stoll crashes if number exceeds the limit for unsigned long long size_t aNumEnd = a.find_first_not_of(L"1234567890", aNumIndex);
// Maximum value for a variable of type unsigned long long | 18446744073709551615 if (aNumEnd == generic_string::npos)
// So take the max length 18 to convert the number {
const size_t maxLen = 18; aNumEnd = a.length();
compareResult = std::stoll(a.substr(i, maxLen)) - std::stoll(b.substr(i, maxLen), &delta); }
i += delta;
size_t bNumEnd = b.find_first_not_of(L"1234567890", bNumIndex);
if (bNumEnd == generic_string::npos)
{
bNumEnd = b.length();
}
int aZeroNum = 0;
while (aNumIndex < a.length() && a[aNumIndex] == '0')
{
aZeroNum++;
aNumIndex++;
}
int bZeroNum = 0;
while (bNumIndex < b.length() && b[bNumIndex] == '0')
{
bZeroNum++;
bNumIndex++;
}
size_t aNumLength = aNumEnd - aNumIndex;
size_t bNumLength = bNumEnd - bNumIndex;
// aNum is longer than bNum, must be larger (smaller if negative)
if (aNumLength > bNumLength)
{
compareResult = 1 * aNumSign;
// No need to update anything; compareResult != 0; will break out while loop
}
// bNum is longer than aNum, must be larger (smaller if negative)
else if (aNumLength < bNumLength)
{
compareResult = -1 * aNumSign;
// No need to update anything; compareResult != 0; will break out while loop
}
else
{
// the lengths of the numbers are equal
// compare the two number. However, we can not use std::stoll
// because the number strings can be every large, well over the maximum long long value
// thus, we compare each digit one by one
while (compareResult == 0
&& aNumIndex < a.length()
&& (a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9')
&& bNumIndex < b.length()
&& (b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9'))
{
compareResult = (a[aNumIndex] - b[bNumIndex]) * aNumSign;
aNumIndex++;
bNumIndex++;
}
if (compareResult == 0)
{
compareResult = bZeroNum - aZeroNum;
}
}
}
} }
// Both are strings // Both are strings
else else
{ {
size_t aChunkEnd = a.find_first_of(L"1234567890", i); if (a[aNumIndex] == L'-')
size_t bChunkEnd = b.find_first_of(L"1234567890", i); {
compareResult = a.compare(i, aChunkEnd-i, b, i, bChunkEnd-i); aNumIndex++;
i = aChunkEnd; }
if (b[bNumIndex] == L'-')
{
bNumIndex++;
}
size_t aChunkEnd = a.find_first_of(L"1234567890-", aNumIndex);
size_t bChunkEnd = b.find_first_of(L"1234567890-", bNumIndex);
compareResult = a.compare(aNumIndex, aChunkEnd - aNumIndex, b, bNumIndex, bChunkEnd - bNumIndex);
aNumIndex = aChunkEnd;
bNumIndex = bChunkEnd;
} }
} }
@ -286,6 +503,8 @@ public:
} }
}); });
} }
return lines; return lines;
} }
}; };

View File

@ -708,7 +708,7 @@ void Notepad_plus::command(int id)
} }
else if (id == IDM_EDIT_SORTLINES_INTEGER_DESCENDING || id == IDM_EDIT_SORTLINES_INTEGER_ASCENDING) else if (id == IDM_EDIT_SORTLINES_INTEGER_DESCENDING || id == IDM_EDIT_SORTLINES_INTEGER_ASCENDING)
{ {
pSorter = std::unique_ptr<ISorter>(new NaturalSorter(isDescending, fromColumn, toColumn)); pSorter = std::unique_ptr<ISorter>(new IntegerSorter(isDescending, fromColumn, toColumn));
} }
else if (id == IDM_EDIT_SORTLINES_DECIMALCOMMA_DESCENDING || id == IDM_EDIT_SORTLINES_DECIMALCOMMA_ASCENDING) else if (id == IDM_EDIT_SORTLINES_DECIMALCOMMA_DESCENDING || id == IDM_EDIT_SORTLINES_DECIMALCOMMA_ASCENDING)
{ {