From 721f994df8c5ea8085a6c9b82a4d6a85110f6f6e Mon Sep 17 00:00:00 2001 From: wiseyestudio00 Date: Wed, 6 Apr 2022 02:07:47 -0400 Subject: [PATCH] Fix "Sort Lines as Integer" not considering negative number The current implementation is a variant of Natural Sort, which takes in account of negative numbers. Fix #11023, fix #2025, close #11481 --- PowerEditor/src/MISC/Common/Sorters.h | 307 ++++++++++++++++++++++---- PowerEditor/src/NppCommands.cpp | 2 +- 2 files changed, 264 insertions(+), 45 deletions(-) diff --git a/PowerEditor/src/MISC/Common/Sorters.h b/PowerEditor/src/MISC/Common/Sorters.h index 6ceb961fa..df33819ac 100644 --- a/PowerEditor/src/MISC/Common/Sorters.h +++ b/PowerEditor/src/MISC/Common/Sorters.h @@ -161,18 +161,13 @@ public: } }; -// Treat consecutive numerals as one number -// Otherwise it is a lexicographic sort -class NaturalSorter : public ISorter +class IntegerSorter : public ISorter { public: - NaturalSorter(bool isDescending, size_t fromColumn, size_t toColumn) : ISorter(isDescending, fromColumn, toColumn) { }; + IntegerSorter(bool isDescending, size_t fromColumn, size_t toColumn) : ISorter(isDescending, fromColumn, toColumn) { }; std::vector sort(std::vector lines) override { - // Note that both branches here are equivalent in the sense that they always give the same answer. - // However, if we are *not* sorting specific columns, then we get a 40% speed improvement by not calling - // getSortKey() so many times. if (isSortingSpecificColumns()) { std::sort(lines.begin(), lines.end(), [this](generic_string aIn, generic_string bIn) @@ -181,43 +176,152 @@ public: generic_string b = getSortKey(bIn); long long compareResult = 0; - size_t i = 0; + size_t aNumIndex = 0; + size_t bNumIndex = 0; while (compareResult == 0) { - if (i >= a.length() || i >= b.length()) + if (aNumIndex >= a.length() || bNumIndex >= b.length()) { - compareResult = a.compare(min(i, a.length()), generic_string::npos, b, min(i, b.length()), generic_string::npos); + compareResult = a.compare(min(aNumIndex, a.length()), generic_string::npos, b, min(bNumIndex, b.length()), generic_string::npos); break; } - bool aChunkIsNum = a[i] >= L'0' && a[i] <= L'9'; - bool bChunkIsNum = b[i] >= L'0' && b[i] <= L'9'; + bool aChunkIsNum = a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9'; + bool bChunkIsNum = b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9'; + + int aNumSign = 1; + // Could be start of negative number + if (!aChunkIsNum && (aNumIndex + 1) < a.length()) + { + aChunkIsNum = (a[aNumIndex] == L'-' && (a[aNumIndex + 1] >= L'0' && a[aNumIndex + 1] <= L'9')); + aNumSign = -1; + } + + int bNumSign = 1; + if (!bChunkIsNum && (bNumIndex + 1) < b.length()) + { + bChunkIsNum = (b[bNumIndex] == L'-' && (b[bNumIndex + 1] >= L'0' && b[bNumIndex + 1] <= L'9')); + bNumSign = -1; + } // One is number and one is string if (aChunkIsNum != bChunkIsNum) { - compareResult = a[i] - b[i]; - // No need to update i; compareResult != 0 + compareResult = a[aNumIndex] - b[bNumIndex]; + + // compareResult isn't necessarily 0 + // consider this case: "0-0", "0-" + // "-0" is considered a number, but "-" isn't + // but we are comparing two "-', which is the same + aNumIndex++; + bNumIndex++; } // Both are numbers else if (aChunkIsNum) { - size_t delta = 0; + // if the sign is differemt, just return the compareResult + if (aNumSign != bNumSign) + { + if (aNumSign == 1) + { + compareResult = 1; + } + else + { + compareResult = -1; + } + // No need to update anything; compareResult != 0; will break out while loop + } + else + { + if (aNumSign == -1) + { + aNumIndex++; + bNumIndex++; + } - // stoll crashes if number exceeds the limit for unsigned long long - // Maximum value for a variable of type unsigned long long | 18446744073709551615 - // So take the max length 18 to convert the number - const size_t maxLen = 18; - compareResult = std::stoll(a.substr(i, maxLen)) - std::stoll(b.substr(i, maxLen), &delta); - i += delta; + size_t aNumEnd = a.find_first_not_of(L"1234567890", aNumIndex); + if (aNumEnd == generic_string::npos) + { + aNumEnd = a.length(); + } + + size_t bNumEnd = b.find_first_not_of(L"1234567890", bNumIndex); + if (bNumEnd == generic_string::npos) + { + bNumEnd = b.length(); + } + + int aZeroNum = 0; + while (aNumIndex < a.length() && a[aNumIndex] == '0') + { + aZeroNum++; + aNumIndex++; + } + + int bZeroNum = 0; + while (bNumIndex < b.length() && b[bNumIndex] == '0') + { + bZeroNum++; + bNumIndex++; + } + + size_t aNumLength = aNumEnd - aNumIndex; + size_t bNumLength = bNumEnd - bNumIndex; + + // aNum is longer than bNum, must be larger (smaller if negative) + if (aNumLength > bNumLength) + { + compareResult = 1 * aNumSign; + // No need to update anything; compareResult != 0; will break out while loop + } + // bNum is longer than aNum, must be larger (smaller if negative) + else if (aNumLength < bNumLength) + { + compareResult = -1 * aNumSign; + // No need to update anything; compareResult != 0; will break out while loop + } + else + { + // the lengths of the numbers are equal + // compare the two number. However, we can not use std::stoll + // because the number strings can be every large, well over the maximum long long value + // thus, we compare each digit one by one + while (compareResult == 0 + && aNumIndex < a.length() + && (a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9') + && bNumIndex < b.length() + && (b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9')) + { + compareResult = (a[aNumIndex] - b[bNumIndex]) * aNumSign; + aNumIndex++; + bNumIndex++; + } + + if (compareResult == 0) + { + compareResult = bZeroNum - aZeroNum; + } + } + } } // Both are strings else { - size_t aChunkEnd = a.find_first_of(L"1234567890", i); - size_t bChunkEnd = b.find_first_of(L"1234567890", i); - compareResult = a.compare(i, aChunkEnd - i, b, i, bChunkEnd - i); - i = aChunkEnd; + if (a[aNumIndex] == L'-') + { + aNumIndex++; + } + if (b[bNumIndex] == L'-') + { + bNumIndex++; + } + + size_t aChunkEnd = a.find_first_of(L"1234567890-", aNumIndex); + size_t bChunkEnd = b.find_first_of(L"1234567890-", bNumIndex); + compareResult = a.compare(aNumIndex, aChunkEnd - aNumIndex, b, bNumIndex, bChunkEnd - bNumIndex); + aNumIndex = aChunkEnd; + bNumIndex = bChunkEnd; } } @@ -233,46 +337,159 @@ public: } else { - std::sort(lines.begin(), lines.end(), [this](generic_string a, generic_string b) + std::sort(lines.begin(), lines.end(), [this](generic_string aIn, generic_string bIn) { + generic_string a = aIn; + generic_string b = bIn; + long long compareResult = 0; - size_t i = 0; + size_t aNumIndex = 0; + size_t bNumIndex = 0; while (compareResult == 0) { - if (i >= a.length() || i >= b.length()) + if (aNumIndex >= a.length() || bNumIndex >= b.length()) { - compareResult = a.compare(min(i,a.length()), generic_string::npos, b, min(i,b.length()), generic_string::npos); + compareResult = a.compare(min(aNumIndex, a.length()), generic_string::npos, b, min(bNumIndex, b.length()), generic_string::npos); break; } - bool aChunkIsNum = a[i] >= L'0' && a[i] <= L'9'; - bool bChunkIsNum = b[i] >= L'0' && b[i] <= L'9'; + bool aChunkIsNum = a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9'; + bool bChunkIsNum = b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9'; + + int aNumSign = 1; + // Could be start of negative number + if (!aChunkIsNum && (aNumIndex + 1) < a.length()) + { + aChunkIsNum = (a[aNumIndex] == L'-' && (a[aNumIndex + 1] >= L'0' && a[aNumIndex + 1] <= L'9')); + aNumSign = -1; + } + + int bNumSign = 1; + if (!bChunkIsNum && (bNumIndex + 1) < b.length()) + { + bChunkIsNum = (b[bNumIndex] == L'-' && (b[bNumIndex + 1] >= L'0' && b[bNumIndex + 1] <= L'9')); + bNumSign = -1; + } // One is number and one is string if (aChunkIsNum != bChunkIsNum) { - compareResult = a[i] - b[i]; - // No need to update i; compareResult != 0 + compareResult = a[aNumIndex] - b[bNumIndex]; + + // compareResult isn't necessarily 0 + // consider this case: "0-0", "0-" + // "-0" is considered a number, but "-" isn't + // but we are comparing two "-', which is the same + // increment the indexes for this case and continue the loop + aNumIndex++; + bNumIndex++; } // Both are numbers else if (aChunkIsNum) { - size_t delta = 0; + // if the sign is differemt, just return the compareResult + if (aNumSign != bNumSign) + { + if (aNumSign == 1) + { + compareResult = 1; + } + else + { + compareResult = -1; + } + // No need to update anything; compareResult != 0; will break out while loop + } + else + { + if (aNumSign == -1) + { + aNumIndex++; + bNumIndex++; + } - // stoll crashes if number exceeds the limit for unsigned long long - // Maximum value for a variable of type unsigned long long | 18446744073709551615 - // So take the max length 18 to convert the number - const size_t maxLen = 18; - compareResult = std::stoll(a.substr(i, maxLen)) - std::stoll(b.substr(i, maxLen), &delta); - i += delta; + size_t aNumEnd = a.find_first_not_of(L"1234567890", aNumIndex); + if (aNumEnd == generic_string::npos) + { + aNumEnd = a.length(); + } + + size_t bNumEnd = b.find_first_not_of(L"1234567890", bNumIndex); + if (bNumEnd == generic_string::npos) + { + bNumEnd = b.length(); + } + + int aZeroNum = 0; + while (aNumIndex < a.length() && a[aNumIndex] == '0') + { + aZeroNum++; + aNumIndex++; + } + + int bZeroNum = 0; + while (bNumIndex < b.length() && b[bNumIndex] == '0') + { + bZeroNum++; + bNumIndex++; + } + + size_t aNumLength = aNumEnd - aNumIndex; + size_t bNumLength = bNumEnd - bNumIndex; + + // aNum is longer than bNum, must be larger (smaller if negative) + if (aNumLength > bNumLength) + { + compareResult = 1 * aNumSign; + // No need to update anything; compareResult != 0; will break out while loop + } + // bNum is longer than aNum, must be larger (smaller if negative) + else if (aNumLength < bNumLength) + { + compareResult = -1 * aNumSign; + // No need to update anything; compareResult != 0; will break out while loop + } + else + { + // the lengths of the numbers are equal + // compare the two number. However, we can not use std::stoll + // because the number strings can be every large, well over the maximum long long value + // thus, we compare each digit one by one + while (compareResult == 0 + && aNumIndex < a.length() + && (a[aNumIndex] >= L'0' && a[aNumIndex] <= L'9') + && bNumIndex < b.length() + && (b[bNumIndex] >= L'0' && b[bNumIndex] <= L'9')) + { + compareResult = (a[aNumIndex] - b[bNumIndex]) * aNumSign; + aNumIndex++; + bNumIndex++; + } + + if (compareResult == 0) + { + compareResult = bZeroNum - aZeroNum; + } + } + } } // Both are strings else { - size_t aChunkEnd = a.find_first_of(L"1234567890", i); - size_t bChunkEnd = b.find_first_of(L"1234567890", i); - compareResult = a.compare(i, aChunkEnd-i, b, i, bChunkEnd-i); - i = aChunkEnd; + if (a[aNumIndex] == L'-') + { + aNumIndex++; + } + if (b[bNumIndex] == L'-') + { + bNumIndex++; + } + + size_t aChunkEnd = a.find_first_of(L"1234567890-", aNumIndex); + size_t bChunkEnd = b.find_first_of(L"1234567890-", bNumIndex); + compareResult = a.compare(aNumIndex, aChunkEnd - aNumIndex, b, bNumIndex, bChunkEnd - bNumIndex); + aNumIndex = aChunkEnd; + bNumIndex = bChunkEnd; } } @@ -286,6 +503,8 @@ public: } }); } + + return lines; } }; diff --git a/PowerEditor/src/NppCommands.cpp b/PowerEditor/src/NppCommands.cpp index 05f44f768..809dbd152 100644 --- a/PowerEditor/src/NppCommands.cpp +++ b/PowerEditor/src/NppCommands.cpp @@ -708,7 +708,7 @@ void Notepad_plus::command(int id) } else if (id == IDM_EDIT_SORTLINES_INTEGER_DESCENDING || id == IDM_EDIT_SORTLINES_INTEGER_ASCENDING) { - pSorter = std::unique_ptr(new NaturalSorter(isDescending, fromColumn, toColumn)); + pSorter = std::unique_ptr(new IntegerSorter(isDescending, fromColumn, toColumn)); } else if (id == IDM_EDIT_SORTLINES_DECIMALCOMMA_DESCENDING || id == IDM_EDIT_SORTLINES_DECIMALCOMMA_ASCENDING) {