admin管理员组文章数量:1344557
Working answer for fuzzily extracting multiple units and their values from messy data.
Extracts values written in scientific notation as well as regular numbers.
It will be updated as it is developed, but it works reasonably well for now.
Working Answer for extracting multiple Units and values fuzzily from messy data.
Extracts Scientific notation and regular numbers as well as values.
Will be updating as developed but working reasonably well for now.
Share Improve this question asked 17 hours ago NickNick 9192 gold badges12 silver badges44 bronze badges 2- 1 Have you considered uploading these to a GitHub or contributing them to one that already exists like: github/ImkeF/M – davidebacci Commented 16 hours ago
- Admittedly no; however, that's not a bad idea. – Nick Commented 16 hours ago
1 Answer
Reset to default 0(
// Text that is searched for unit phrases and the numbers next to them.
targetText as text,
// Candidate phrases for the first unit; each is fuzzy-matched against targetText.
SearchPhrase1 as list,
// Optional labels reported instead of the search phrases. Paired with SearchPhrase1
// by position, and only used when both lists have the same number of items.
optional ReturnPhrase1 as nullable list,
// Optional conversions, looked up by the position of the returned label in ReturnPhrase1.
// An item is either convertible to a number (used as a multiplier) or an expression text in "x".
optional Conversion1 as nullable list,
// Minimum similarity for a fuzzy match to be accepted; 0.5 when omitted.
optional Threshold1 as nullable number,
// Side of the phrase on which the number is searched: "left" (default), "right" or "both".
optional SearchDirection1 as nullable text,
// Same five settings for an optional second unit.
optional SearchPhrase2 as nullable list,
optional ReturnPhrase2 as nullable list,
optional Conversion2 as nullable list,
optional Threshold2 as nullable number,
optional SearchDirection2 as nullable text
) as record =>
let
// --------------------------------------------------
// Defaults for the optional arguments and shared helpers
// --------------------------------------------------
// Similarity cut-offs fall back to 0.5 when the caller leaves them out.
actualThreshold1 = if Threshold1 = null then 0.5 else Threshold1,
actualThreshold2 = if Threshold2 = null then 0.5 else Threshold2,
// Search directions are trimmed and lower-cased; "left" is the default.
direction1 = if SearchDirection1 = null then "left" else Text.Lower(Text.Trim(SearchDirection1)),
direction2 = if SearchDirection2 = null then "left" else Text.Lower(Text.Trim(SearchDirection2)),
// Canonical form used for comparisons: surrounding whitespace removed, then lower-cased.
NormalizeText = (txt as text) =>
    let
        trimmed = Text.Trim(txt)
    in
        Text.Lower(trimmed),
// --------------------------------------------------
// FnTextBeforeSecondDelimiterIgnoreCase
// Returns the start of inputText up to and including the second delimiter
// occurrence, where an occurrence of either delimiter counts and case is ignored.
//   * delimiter2 null or "" -> inputText is returned unchanged.
//   * neither delimiter found, or no further delimiter after the first
//     occurrence -> null.
// --------------------------------------------------
FnTextBeforeSecondDelimiterIgnoreCase = (inputText as text, delimiter1 as text, optional delimiter2 as nullable text) as nullable text =>
    let
        // Earliest of two positions where -1 means "not found".
        EarliestOf = (a, b) => if a = -1 then b else if b = -1 then a else List.Min({a, b}),
        hasSecondDelimiter = delimiter2 <> null and delimiter2 <> "",
        haystack = Text.Lower(inputText),
        needle1 = Text.Lower(delimiter1),
        needle2 = Text.Lower(delimiter2),
        firstHit = EarliestOf(Text.PositionOf(haystack, needle1), Text.PositionOf(haystack, needle2)),
        // The second search starts one character after the first hit.
        tail = Text.Range(inputText, firstHit + 1),
        tailLower = Text.Lower(tail),
        hit1 = Text.PositionOf(tailLower, needle1),
        hit2 = Text.PositionOf(tailLower, needle2),
        secondHit = EarliestOf(hit1, hit2),
        // Delimiter that produced the second hit; its length decides where the cut ends.
        closing = if hit1 <> -1 and (hit1 <= hit2 or hit2 = -1) then delimiter1 else delimiter2,
        output =
            if not hasSecondDelimiter then inputText
            else if firstHit = -1 then null
            else if secondHit = -1 then null
            else Text.Start(inputText, firstHit + 1 + secondHit + Text.Length(closing))
    in
        output,
// --------------------------------------------------
// GetWindowedText
// Cuts a slice of source around the first case-insensitive occurrence of
// phrase. The slice starts up to `window` characters before the hit and is
// at most (phrase length + 2 * window) characters long.
// When phrase is null or not found, source is returned unchanged.
// --------------------------------------------------
GetWindowedText = (source as text, phrase as nullable text, window as number) as text =>
    let
        needle = if phrase = null then null else Text.Lower(Text.Trim(phrase)),
        hitAt = if needle = null then -1 else Text.PositionOf(Text.Lower(source), needle),
        // Never start before the beginning of the text.
        startAt = List.Max({0, hitAt - window}),
        available = Text.Length(source) - startAt,
        wanted = Text.Length(needle) + window * 2,
        slice = Text.Range(source, startAt, List.Min({available, wanted}))
    in
        if hitAt = -1 then source else slice,
// --------------------------------------------------
// Levenshtein
// Edit distance between s and t (insertions, deletions and substitutions
// each cost 1). Comparison is case-sensitive. Only the previous row of the
// distance table is kept while the next one is built.
// --------------------------------------------------
Levenshtein = (s as text, t as text) as number =>
    let
        sLen = Text.Length(s),
        tLen = Text.Length(t),
        sourceChars = Text.ToList(s),
        targetChars = Text.ToList(t),
        // Builds table row `rowIndex` from the row above it.
        NextRow = (above as list, rowIndex as number) as list =>
            let
                current = sourceChars{rowIndex - 1}
            in
                List.Accumulate(
                    {1..tLen},
                    {rowIndex},
                    (partial, col) =>
                        let
                            // The last item of the partial row is the cell to the left.
                            viaLeft = List.Last(partial) + 1,
                            viaAbove = above{col} + 1,
                            viaDiagonal = above{col - 1} + (if current = targetChars{col - 1} then 0 else 1)
                        in
                            partial & {List.Min({viaLeft, viaAbove, viaDiagonal})}
                ),
        // Row 0 is 0..tLen: the cost of building each prefix of t from nothing.
        lastRow = List.Accumulate({1..sLen}, {0..tLen}, NextRow)
    in
        lastRow{tLen},
// --------------------------------------------------
// ExtractValueFromSource
// Finds the number written next to `phrase` inside `source`.
//   source    - text to search; line feeds become spaces, commas are removed,
//               and the text is trimmed and lower-cased before matching.
//   phrase    - unit/phrase the number belongs to; null yields null.
//   direction - "right": number follows the phrase; "both": try the left side
//               first, then the right; anything else: number precedes the phrase.
// Up to 10 non-alphanumeric characters may sit between number and phrase.
// Returns the matched number text (plain or scientific notation) or null.
//
// NOTE(review): RegexMatch is not defined in the visible source; it is assumed
// to be a separately defined query returning a list whose item 1 is the first
// capture group - confirm. Errors raised by the call are turned into null.
// NOTE(review): because commas are stripped up front, the "," alternative in
// the number pattern cannot match and "3,5" is read as "35" - confirm that
// commas are always thousands separators in the data.
// --------------------------------------------------
ExtractValueFromSource = (source as text, phrase as nullable text, direction as text) as nullable text =>
    let
        Cleaned = Text.Lower(Text.Trim(Text.Replace(Text.Replace(source, "#(lf)", " "), ",", ""))),
        rawPhrase = if phrase <> null then Text.Lower(Text.Trim(phrase)) else null,
        // The phrase is literal text taken from the data, so regex metacharacters in
        // it ("m^2", "mg/l (dry)", "a.u.") must be escaped before it joins the pattern.
        regexSpecials = {"\", ".", "^", "$", "|", "?", "*", "+", "(", ")", "[", "]", "{", "}"},
        escapedPhrase =
            if rawPhrase = null then null
            else Text.Combine(
                List.Transform(
                    Text.ToList(rawPhrase),
                    each if List.Contains(regexSpecials, _) then "\" & _ else _
                )
            ),
        baseNumberPattern = "(-?\d+(?:[.,]\d+)?(?:\s*[x*×]\s*10[-+]?\d+|[eE][-+]?\d+)?)",
        leftPattern = baseNumberPattern & "[^a-zA-Z0-9]{0,10}" & escapedPhrase,
        rightPattern = escapedPhrase & "[^a-zA-Z0-9]{0,10}" & baseNumberPattern,
        // Runs one pattern and returns capture group 1 (the number) or null.
        getMatch = (pattern) =>
            let
                match = try RegexMatch(Cleaned, pattern, "i") otherwise null
            in
                if match <> null and List.Count(match) > 1 then match{1} else null,
        result =
            if phrase = null then null
            else if direction = "right" then getMatch(rightPattern)
            else if direction = "both" then
                let
                    tryLeft = getMatch(leftPattern),
                    tryRight = getMatch(rightPattern)
                in
                    if tryLeft <> null then tryLeft else tryRight
            else getMatch(leftPattern)
    in
        result,
// --------------------------------------------------
// GetRangeTextFromSource
// Looks for the first "number - number" range in source (hyphen, en dash or
// em dash, optional whitespace around it) and returns it as "low-high".
// The text is flattened (line feeds -> spaces), stripped of commas, trimmed
// and lower-cased before matching. Returns null when no range is found.
// NOTE(review): RegexMatch is not defined in the visible source; assumed to
// return a list with the capture groups at items 1 and 2 - confirm.
// --------------------------------------------------
GetRangeTextFromSource = (source as text) as nullable text =>
    let
        flattened = Text.Replace(source, "#(lf)", " "),
        withoutCommas = Text.Replace(flattened, ",", ""),
        haystack = Text.Lower(Text.Trim(withoutCommas)),
        rangeRegex = "(\d+(?:[.,]\d+)?)[\s]*[-–—][\s]*(\d+(?:[.,]\d+)?)",
        // Any error raised by the regex call is treated as "no match".
        groups = try RegexMatch(haystack, rangeRegex, "i") otherwise null,
        hasBothEnds = groups <> null and List.Count(groups) > 2
    in
        if hasBothEnds then groups{1} & "-" & groups{2} else null,
// --------------------------------------------------
// IsScientificText
// True when the trimmed, lower-cased text contains one of the markers below
// ("e-" / "e+" exponents, or a x10 / *10 / ×10 / ^10 form followed by a minus).
// --------------------------------------------------
IsScientificText = (txt as text) as logical =>
    let
        haystack = Text.Lower(Text.Trim(txt)),
        markers = {
            "e-", "e+",
            "×10-", "× 10-",
            "*10-", "* 10-",
            "x10-", "x 10-",
            " x10-", " x 10-",
            "^10-", "^ 10-"
        },
        present = List.Select(markers, (marker) => Text.Contains(haystack, marker))
    in
        not List.IsEmpty(present),
// Returns the part of inputText that follows the first case-insensitive
// occurrence of delimiter, or null when the delimiter does not occur.
TextAfterIgnoreCase = (inputText as text, delimiter as text) as nullable text =>
    let
        hitAt = Text.PositionOf(Text.Lower(inputText), Text.Lower(delimiter))
    in
        if hitAt = -1 then null else Text.Range(inputText, hitAt + Text.Length(delimiter)),
// --------------------------------------------------
// FuzzyMatch
// Finds the search phrase that best matches some stretch of sourceText.
//   sourceText - text to search (trimmed and lower-cased before matching).
//   searchList - candidate phrases.
//   returnList - optional labels paired with searchList by position; ignored
//                (the search phrase itself is returned) when it is null or its
//                length differs from searchList.
//   threshold  - minimum similarity to accept a match; 0.5 when null.
// Every phrase is tried as written and with its spaces removed. Candidate
// stretches are all windows of the phrase's length that are not directly
// preceded or followed by a letter; similarity is
// 1 - Levenshtein / max(length). Ties keep the earliest window / phrase.
// Returns [SearchedPhrase, MatchedPhrase, ReturnedPhrase, Similarity]; all
// four fields are null when nothing reaches the threshold.
// --------------------------------------------------
FuzzyMatch = (
    sourceText as text,
    searchList as list,
    optional returnList as nullable list,
    optional threshold as nullable number
) as record =>
    let
        actualThreshold = if threshold <> null then threshold else 0.5,
        NoMatch = [SearchedPhrase = null, MatchedPhrase = null, ReturnedPhrase = null, Similarity = null],
        zippedList =
            if returnList <> null and List.Count(searchList) = List.Count(returnList) then
                List.Transform(List.Zip({searchList, returnList}), each [SearchPhraseOriginal = _{0}, ReturnedPhrase = _{1}])
            else
                List.Transform(searchList, each [SearchPhraseOriginal = _, ReturnedPhrase = _]),
        normText = NormalizeText(sourceText),
        textLength = Text.Length(normText),
        // Built once instead of once per character test.
        letters = {"a".."z", "A".."Z"},
        isLetter = (c as nullable text) =>
            if c = null then false else List.Contains(letters, c),
        // All windows of the phrase's length whose neighbours are not letters.
        slidingPhrases = (searchPhrase as text) as list =>
            let
                sLen = Text.Length(searchPhrase),
                // An empty phrase would produce zero-length windows that score a
                // perfect similarity of 1 and outrank every real phrase, so it
                // yields no candidates at all.
                range = if sLen = 0 then {} else {0..(textLength - sLen)},
                windows = List.Transform(range, (i) =>
                    [
                        before = if i = 0 then null else Text.Middle(normText, i - 1, 1),
                        match = Text.Middle(normText, i, sLen),
                        after = if i + sLen >= textLength then null else Text.Middle(normText, i + sLen, 1)
                    ]
                ),
                // isLetter(null) is false, so text edges count as valid boundaries.
                boundaryWindows = List.Select(windows, each (not isLetter(_[before])) and (not isLetter(_[after]))),
                result = List.Transform(boundaryWindows, each _[match])
            in
                result,
        AllMatches = List.Transform(zippedList, (record) =>
            let
                originalPhrase = record[SearchPhraseOriginal],
                normalizedPhrase = NormalizeText(originalPhrase),
                combinedPhrase = Text.Replace(normalizedPhrase, " ", ""),
                returnPhrase = record[ReturnedPhrase],
                // Best-scoring window for one spelling of the phrase.
                TestPhrase = (searchPhrase as text) as record =>
                    let
                        possiblePhrases = slidingPhrases(searchPhrase),
                        scored = List.Transform(possiblePhrases, (phrase) =>
                            let
                                dist = Levenshtein(NormalizeText(phrase), searchPhrase),
                                maxLen = List.Max({Text.Length(phrase), Text.Length(searchPhrase)}),
                                similarity = if maxLen = 0 then 1 else 1 - (dist / maxLen),
                                result = [
                                    SearchedPhrase = originalPhrase,
                                    MatchedPhrase = phrase,
                                    ReturnedPhrase = returnPhrase,
                                    Similarity = similarity
                                ]
                            in
                                result
                        ),
                        good = List.Select(scored, each _[Similarity] >= actualThreshold),
                        best =
                            if List.Count(good) = 0 then
                                NoMatch
                            else
                                let
                                    maxSim = List.Max(List.Transform(good, each _[Similarity])),
                                    top = List.First(List.Select(good, each _[Similarity] = maxSim))
                                in
                                    top
                    in
                        best,
                bestFromSpaced = TestPhrase(normalizedPhrase),
                // A phrase without spaces is identical in both spellings; reuse the
                // first result instead of scoring every window a second time.
                bestFromCombined =
                    if combinedPhrase = normalizedPhrase then bestFromSpaced else TestPhrase(combinedPhrase),
                FinalBest =
                    if bestFromSpaced[Similarity] = null and bestFromCombined[Similarity] = null then
                        NoMatch
                    else if bestFromSpaced[Similarity] = null then
                        bestFromCombined
                    else if bestFromCombined[Similarity] = null then
                        bestFromSpaced
                    else if bestFromSpaced[Similarity] >= bestFromCombined[Similarity] then
                        bestFromSpaced
                    else
                        bestFromCombined
            in
                FinalBest
        ),
        ValidMatches = List.Select(AllMatches, each _[Similarity] <> null),
        FinalBest =
            if List.Count(ValidMatches) = 0 then
                NoMatch
            else
                let
                    topScore = List.Max(List.Transform(ValidMatches, each _[Similarity])),
                    topMatch = List.First(List.Select(ValidMatches, each _[Similarity] = topScore))
                in
                    topMatch
    in
        FinalBest,
// --------------------------------------------------
// MAIN MATCH & EXTRACTION PROCESS
// 1. Fuzzy-match both phrase lists against targetText.
// 2. Cut the text after the second phrase occurrence so later text is ignored.
// 3. For each phrase, take a 30-character window around it and extract the
//    adjacent number; a "low-high" range in the window takes precedence
//    (its lower bound is used) unless the value is in scientific notation.
// Phrase 2 is searched only in the text after the first occurrence of phrase 1.
// --------------------------------------------------
match1 = FuzzyMatch(targetText, SearchPhrase1, ReturnPhrase1, Threshold1),
match2 = if SearchPhrase2 <> null then FuzzyMatch(targetText, SearchPhrase2, ReturnPhrase2, Threshold2)
    else [SearchedPhrase = null, MatchedPhrase = null, ReturnedPhrase = null, Similarity = null],
hasMatch1 = match1[MatchedPhrase] <> null,
hasMatch2 = match2[MatchedPhrase] <> null,
// The helper requires a non-null first delimiter and may itself return null
// (no second occurrence found); in both cases keep working on the full text
// instead of raising a type error further down.
boundedText =
    if hasMatch1 then FnTextBeforeSecondDelimiterIgnoreCase(targetText, match1[MatchedPhrase], match2[MatchedPhrase])
    else null,
trimmedText = if boundedText <> null then boundedText else targetText,
segment1 = GetWindowedText(trimmedText, match1[MatchedPhrase], 30),
val1 = ExtractValueFromSource(segment1, match1[MatchedPhrase], direction1),
isSci1 = if val1 <> null then IsScientificText(val1) else false,
// Without a matched phrase the "window" is the whole text, so a range found
// there has nothing to do with the unit and must not be reported.
ExtractedRange1 = if isSci1 or not hasMatch1 then null else GetRangeTextFromSource(segment1),
finalVal1 = if isSci1 then val1 else if ExtractedRange1 <> null then Text.BeforeDelimiter(ExtractedRange1, "-") else val1,
// Text after phrase 1; falls back to the bounded text when phrase 1 is absent from it.
afterMatch1 = if hasMatch1 then TextAfterIgnoreCase(trimmedText, match1[MatchedPhrase]) else null,
tempSource = if afterMatch1 <> null then afterMatch1 else trimmedText,
segment2 = GetWindowedText(tempSource, match2[MatchedPhrase], 30),
val2 = ExtractValueFromSource(segment2, match2[MatchedPhrase], direction2),
isSci2 = if val2 <> null then IsScientificText(val2) else false,
ExtractedRange2 = if isSci2 or not hasMatch2 then null else GetRangeTextFromSource(segment2),
finalVal2 = if isSci2 then val2 else if ExtractedRange2 <> null then Text.BeforeDelimiter(ExtractedRange2, "-") else val2,
// --------------------------------------------------
// NormalizeFinalNumber
// Converts extracted number text to a number.
//   valueText - e.g. "12.5", "1.5e-5", "1.5 x 10-5", "2×10^-10"; null yields null.
// "mantissa <sep> 10 <exponent>" forms (sep is ×, * or x) are rewritten to
// "mantissa e exponent" first. A comma is treated as the decimal mark only
// when the text contains no "."; otherwise commas are dropped.
// Returns null when the text cannot be parsed.
// --------------------------------------------------
NormalizeFinalNumber = (valueText as nullable text) as nullable number =>
    let
        raw = if valueText = null then null else Text.Trim(valueText),
        separators = {"×", "*", "x"},
        found = if raw = null then null else List.First(List.Select(separators, each Text.Contains(raw, _)), null),
        parts =
            if raw <> null and found <> null and Text.Contains(raw, "10") then
                let
                    split1 = Text.Split(raw, found),
                    hasTwoSides = List.Count(split1) = 2,
                    mainPart = if hasTwoSides then Text.Trim(split1{0}) else raw,
                    powerPart = if hasTwoSides then Text.Trim(split1{1}) else null,
                    // Only the leading "10" is the base. Splitting on every "10" would
                    // break exponents that contain it, e.g. "10-10" or "1010".
                    afterBase =
                        if powerPart <> null and Text.StartsWith(powerPart, "10")
                        then Text.Trim(Text.Range(powerPart, 2))
                        else null,
                    // Accept the caret form "10^-5" as well as "10-5".
                    exponent =
                        if afterBase <> null and Text.StartsWith(afterBase, "^")
                        then Text.Trim(Text.Range(afterBase, 1))
                        else afterBase,
                    recombined = if exponent <> null and exponent <> "" then mainPart & "e" & exponent else raw
                in
                    recombined
            else
                raw,
        // Fallback rewrites for forms the split above did not handle.
        sciCleaned =
            if parts = null then null
            else
                Text.Replace(
                    Text.Replace(
                        Text.Replace(
                            Text.Replace(
                                Text.Replace(parts, "×", "e"),
                            "x10^", "e"),
                        "*10^", "e"),
                    "*10", "e"),
                " x10", "e"),
        cleaned =
            if sciCleaned = null then null
            else if Text.Contains(sciCleaned, ",") and not Text.Contains(sciCleaned, ".")
            then Text.Replace(Text.Remove(sciCleaned, {" "}), ",", ".")
            else Text.Replace(Text.Remove(sciCleaned, {" "}), ",", ""),
        // The text now uses "." as decimal mark, so parse it with a fixed culture
        // rather than whatever locale the workbook happens to run under.
        asNumber = if cleaned = null then null else try Number.FromText(cleaned, "en-US") otherwise null
    in
        asNumber,
// --------------------------------------------------
// FnLookupInLists
// Looks up param2 in ReturnUnitList and applies the conversion stored at the
// same position of ConversionList to param1.
//   param1         - value to convert (null -> ConvertedValue is null).
//   param2         - label to look up; the first occurrence is used.
//   SearchUnitList - search phrases, reported back as SearchUnit.
//   ReturnUnitList - labels the lookup is done against.
//   ConversionList - per-label conversion: a value convertible to a number is
//                    used as multiplier; anything else is evaluated as an M
//                    expression in which `x` is bound to param1.
// Returns [Found = false, Message, ConvertedValue = null] when param2 is null
// or unknown, otherwise
// [Found = true, SearchUnit, ReturnUnit, Conversion, ConvertedValue].
// --------------------------------------------------
FnLookupInLists = (
    param1 as any,
    param2 as nullable text,
    SearchUnitList as list,
    ReturnUnitList as list,
    ConversionList as list
) as record =>
    let
        position = if param2 <> null then List.PositionOf(ReturnUnitList, param2) else -1,
        Result =
            if param2 = null then
                [Found = false, Message = "Null parameter", ConvertedValue = null]
            else if position = -1 then
                [Found = false, Message = "No match found for: " & param2, ConvertedValue = null]
            else
                let
                    // Optional item access: a shorter list yields null instead of an
                    // out-of-range error.
                    searchUnit = SearchUnitList{position}?,
                    returnUnit = ReturnUnitList{position},
                    conversion = ConversionList{position}?,
                    conversionTry = try Number.From(conversion),
                    convertedValue =
                        if param1 = null or conversion = null then
                            null
                        else if conversionTry[HasError] then
                            // `x` is supplied through the evaluation environment. Pasting a
                            // formatted number into the expression text instead would depend
                            // on the locale (decimal comma) and would also rewrite any other
                            // "x" in the expression.
                            // SECURITY: this evaluates text from ConversionList as code; only
                            // pass conversion lists from a trusted source.
                            try Expression.Evaluate(Text.From(conversion), [x = param1]) otherwise null
                        else if conversionTry[Value] = 1 then
                            param1
                        else
                            param1 * conversionTry[Value]
                in
                    [
                        Found = true,
                        SearchUnit = searchUnit,
                        ReturnUnit = returnUnit,
                        Conversion = conversion,
                        ConvertedValue = convertedValue
                    ]
    in
        Result,
// Unit conversion is attempted only when the caller supplied both a return
// list and a conversion list and the fuzzy match produced a label; otherwise
// the converted value is simply null.
Converted1 =
    if ReturnPhrase1 = null or Conversion1 = null or match1[ReturnedPhrase] = null then
        [ConvertedValue = null]
    else
        FnLookupInLists(NormalizeFinalNumber(finalVal1), match1[ReturnedPhrase], SearchPhrase1, ReturnPhrase1, Conversion1),
Converted2 =
    if ReturnPhrase2 = null or Conversion2 = null or match2[ReturnedPhrase] = null then
        [ConvertedValue = null]
    else
        FnLookupInLists(NormalizeFinalNumber(finalVal2), match2[ReturnedPhrase], SearchPhrase2, ReturnPhrase2, Conversion2),
// Final output
// One record with the same nine fields for each of the two phrases:
// the fuzzy-match result, the extracted text, and its numeric / converted forms.
// NOTE(review): record fields are evaluated lazily in M, so this `try` probably
// does not catch errors raised while computing individual fields; also, the
// `otherwise null` branch would not satisfy the function's `as record` return
// type - confirm the intended error behaviour.
output = try [
// --- first phrase ---
SearchedPhrase1 = match1[SearchedPhrase],
MatchedPhrase1 = match1[MatchedPhrase],
ReturnedPhrase1 = match1[ReturnedPhrase],
Similarity1 = match1[Similarity],
// Number text as found; the lower bound when a range was detected.
ExtractedValue1 = finalVal1,
IsScientific1 = isSci1,
ExtractedRange1 = ExtractedRange1,
// ExtractedValue1 parsed to a number (null when it cannot be parsed).
ExtractedNumber1 = NormalizeFinalNumber(finalVal1),
ConvertedNumber1 = Converted1[ConvertedValue],
// --- second phrase ---
SearchedPhrase2 = match2[SearchedPhrase],
MatchedPhrase2 = match2[MatchedPhrase],
ReturnedPhrase2 = match2[ReturnedPhrase],
Similarity2 = match2[Similarity],
ExtractedValue2 = finalVal2,
IsScientific2 = isSci2,
ExtractedRange2 = ExtractedRange2,
ExtractedNumber2 = NormalizeFinalNumber(finalVal2),
ConvertedNumber2 = Converted2[ConvertedValue]
] otherwise null
in
output
本文标签: powerqueryFuzzyExtract Units and Values in Power QueryStack Overflow
版权声明:本文标题:powerquery - FuzzyExtract Units and Values in Power Query - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1743759307a2534079.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论