admin管理员组

文章数量:1344557

A working answer for fuzzily extracting multiple units and their values from messy data.

It extracts numbers written in scientific notation as well as regular numbers, together with the values they represent.

I will keep updating it as it develops, but it works reasonably well for now.

A working answer for fuzzily extracting multiple units and their values from messy data.

It extracts numbers written in scientific notation as well as regular numbers, together with the values they represent.

I will keep updating it as it develops, but it works reasonably well for now.

Share Improve this question asked 17 hours ago NickNick 9192 gold badges12 silver badges44 bronze badges 2
  • 1 Have you considered uploading these to a GitHub or contributing them to one that already exists like: github/ImkeF/M – davidebacci Commented 16 hours ago
  • Admittedly no; however, that's not a bad idea. – Nick Commented 16 hours ago
Add a comment  | 

1 Answer 1

Reset to default 0
// Fuzzy-matches up to two phrase lists against a text, extracts the number
// next to each matched phrase, and optionally converts it. Returns a record.
(
    // Text that is searched for the phrases and the adjacent numbers.
    targetText as text,
    // Phrases that are fuzzy-matched against targetText (first group).
    SearchPhrase1 as list,
    // Optional phrases to report instead of the search phrase; only used by
    // FuzzyMatch when it has the same number of items as SearchPhrase1.
    optional ReturnPhrase1 as nullable list,
    // Optional conversions; each entry is either convertible to a number
    // (used as a multiplier) or text evaluated as an expression in x.
    // Only used when ReturnPhrase1 is also supplied.
    optional Conversion1 as nullable list,

    // Minimum similarity for a fuzzy match; FuzzyMatch falls back to 0.5 when null.
    optional Threshold1 as nullable number,
    // Side of the phrase on which the number is searched: "right", "both",
    // anything else is treated as left. Defaults to "left" when null.
    optional SearchDirection1 as nullable text,
    
    // Second phrase group; same meaning as the first group. When
    // SearchPhrase2 is null no second fuzzy match is attempted.
    optional SearchPhrase2 as nullable list,
    optional ReturnPhrase2 as nullable list,
    optional Conversion2 as nullable list,
    
    
    optional Threshold2 as nullable number,
    optional SearchDirection2 as nullable text
    
    
) as record =>

let
    // --------------------------------------------------
    // Global settings and utility functions
    // --------------------------------------------------
    // NOTE(review): actualThreshold1/actualThreshold2 are not referenced anywhere
    // else in the visible code; FuzzyMatch receives Threshold1/Threshold2 directly
    // and applies the same 0.5 default itself.
    actualThreshold1 = if Threshold1 <> null then Threshold1 else 0.5,
    actualThreshold2 = if Threshold2 <> null then Threshold2 else 0.5,
    // Direction keywords are trimmed and lower-cased before being compared.
    direction1 = if SearchDirection1 <> null then Text.Lower(Text.Trim(SearchDirection1)) else "left",
    direction2 = if SearchDirection2 <> null then Text.Lower(Text.Trim(SearchDirection2)) else "left",
    // Lower-cases and trims a text value.
    NormalizeText = (txt as text) => Text.Lower(Text.Trim(txt)),

    // --------------------------------------------------
    // FnTextBeforeSecondDelimiterIgnoreCase
    // --------------------------------------------------
    // Cuts inputText right after the second delimiter hit (case-insensitive).
    //   * delimiter2 null or "" -> inputText is returned untouched.
    //   * The first hit is the earliest occurrence of either delimiter; the
    //     second hit is the earliest occurrence of either delimiter in the text
    //     that starts one character after the first hit.
    //   * The result runs from the start of inputText up to and INCLUDING the
    //     delimiter found at the second hit.
    //   * null when a first or second hit does not exist.
    FnTextBeforeSecondDelimiterIgnoreCase = (inputText as text, delimiter1 as text, optional delimiter2 as nullable text) as nullable text =>
        let
            hasSecondDelimiter = delimiter2 <> null and delimiter2 <> "",
            // Earliest of two positions where -1 means "not found".
            EarliestHit = (p as number, q as number) as number =>
                if p = -1 then q else if q = -1 then p else List.Min({p, q}),
            haystack = Text.Lower(inputText),
            needleA = Text.Lower(delimiter1),
            needleB = Text.Lower(delimiter2),
            firstHit = EarliestHit(Text.PositionOf(haystack, needleA), Text.PositionOf(haystack, needleB)),
            // Everything after the first character of the first hit, lower-cased for searching.
            tail = Text.Lower(Text.Range(inputText, firstHit + 1)),
            tailHitA = Text.PositionOf(tail, needleA),
            tailHitB = Text.PositionOf(tail, needleB),
            secondHit = EarliestHit(tailHitA, tailHitB),
            // Which delimiter closes the cut; its length is appended to the result.
            closingDelimiter =
                if tailHitA <> -1 and (tailHitA <= tailHitB or tailHitB = -1) then delimiter1 else delimiter2,
            trimmed =
                if not hasSecondDelimiter then inputText
                else if firstHit = -1 then null
                else if secondHit = -1 then null
                else Text.Start(inputText, firstHit + 1 + secondHit + Text.Length(closingDelimiter))
        in
            trimmed,

    // --------------------------------------------------
    // GetWindowedText
    // --------------------------------------------------
    // Returns a slice of source around the first case-insensitive occurrence
    // of phrase (trimmed). The slice starts `window` characters before the hit
    // (clamped to 0) and is at most phrase length + 2 * window characters long.
    // When phrase is null or not found, source is returned unchanged.
    GetWindowedText = (source as text, phrase as nullable text, window as number) as text =>
        let
            needle = if phrase = null then null else Text.Lower(Text.Trim(phrase)),
            hitAt = if needle = null then -1 else Text.PositionOf(Text.Lower(source), needle),
            startAt = List.Max({0, hitAt - window}),
            available = Text.Length(source) - startAt,
            wanted = Text.Length(needle) + window * 2,
            slice =
                if hitAt = -1 then source
                else Text.Range(source, startAt, List.Min({available, wanted}))
        in
            slice,

    // --------------------------------------------------
    // Levenshtein
    // --------------------------------------------------
    // Edit distance between s and t (case-sensitive), computed row by row:
    // each row holds the distances between a prefix of s and every prefix of t.
    Levenshtein = (s as text, t as text) as number =>
        let
            sourceChars = Text.ToList(s),
            targetChars = Text.ToList(t),
            sourceCount = Text.Length(s),
            targetCount = Text.Length(t),
            // Builds the row for source character `rowIndex` from the row above it.
            NextRow = (above as list, rowIndex as number) as list =>
                List.Accumulate(
                    {1..targetCount},
                    {rowIndex},
                    (current, col) =>
                        let
                            mismatch = if sourceChars{rowIndex - 1} = targetChars{col - 1} then 0 else 1,
                            candidates = {
                                current{col - 1} + 1,
                                above{col} + 1,
                                above{col - 1} + mismatch
                            }
                        in
                            current & {List.Min(candidates)}
                ),
            // Row 0 is simply 0..targetCount (distance from the empty prefix).
            lastRow = List.Accumulate({1..sourceCount}, {0..targetCount}, NextRow)
        in
            lastRow{targetCount},

    // --------------------------------------------------
    // ExtractValueFromSource
    // --------------------------------------------------
    // Finds the number written next to `phrase` inside `source`.
    //   source    - text to search; line feeds become spaces and commas are removed
    //               before matching, then the text is trimmed and lower-cased.
    //   phrase    - phrase the number must sit next to (null -> result is null).
    //   direction - "right": number after the phrase; "both": try before, then
    //               after; anything else: number before the phrase.
    // Up to 10 non-alphanumeric characters may separate number and phrase.
    // Returns the captured number text, or null when nothing matches.
    //
    // The phrase is escaped before it is placed in the pattern, so units such
    // as "m^3", "(w/w)" or "mg.l" are matched literally instead of being
    // interpreted as regex syntax.
    //
    // NOTE(review): RegexMatch is not defined in this function; presumably it is
    // a separate query returning a list whose item 0 is the whole match and
    // item 1 the first capture group - confirm against its definition.
    ExtractValueFromSource = (source as text, phrase as nullable text, direction as text) as nullable text =>
        let
            Cleaned = Text.Lower(Text.Trim(Text.Replace(Text.Replace(source, "#(lf)", " "), ",", ""))),
            rawPhrase = if phrase <> null then Text.Lower(Text.Trim(phrase)) else null,
            // Characters with a special meaning in regular expressions.
            regexSpecials = "\.^$|?*+()[]{}",
            escapedPhrase =
                if rawPhrase = null then null
                else
                    Text.Combine(
                        List.Transform(
                            Text.ToList(rawPhrase),
                            each if Text.Contains(regexSpecials, _) then "\" & _ else _
                        )
                    ),
            baseNumberPattern = "(-?\d+(?:[.,]\d+)?(?:\s*[x*×]\s*10[-+]?\d+|[eE][-+]?\d+)?)",
            leftPattern = baseNumberPattern & "[^a-zA-Z0-9]{0,10}" & escapedPhrase,
            rightPattern = escapedPhrase & "[^a-zA-Z0-9]{0,10}" & baseNumberPattern,
            // Runs one pattern and returns its first capture group (or null).
            getMatch = (pattern) =>
                let
                    match = try RegexMatch(Cleaned, pattern, "i") otherwise null
                in
                    if match <> null and List.Count(match) > 1 then match{1} else null,
            result =
                if phrase = null then null
                else if direction = "right" then getMatch(rightPattern)
                else if direction = "both" then
                    let
                        tryLeft = getMatch(leftPattern),
                        tryRight = getMatch(rightPattern)
                    in
                        if tryLeft <> null then tryLeft else tryRight
                else getMatch(leftPattern)
        in
            result,

    // --------------------------------------------------
    // GetRangeTextFromSource
    // --------------------------------------------------
    // Looks for a numeric range ("a - b", with a hyphen, en dash or em dash) in
    // source and returns it as "a-b", or null when no range is found.
    // Line feeds are turned into spaces and commas removed before matching.
    // NOTE(review): relies on RegexMatch, which is defined outside this function;
    // items 1 and 2 of its result are assumed to be the two capture groups.
    GetRangeTextFromSource = (source as text) as nullable text =>
        let
            flattened = Text.Replace(source, "#(lf)", " "),
            withoutCommas = Text.Replace(flattened, ",", ""),
            haystack = Text.Lower(Text.Trim(withoutCommas)),
            rangePattern = "(\d+(?:[.,]\d+)?)[\s]*[-–—][\s]*(\d+(?:[.,]\d+)?)",
            groups = try RegexMatch(haystack, rangePattern, "i") otherwise null,
            hasBothEnds = groups <> null and List.Count(groups) > 2
        in
            if hasBothEnds then groups{1} & "-" & groups{2} else null,

    // --------------------------------------------------
    // IsScientificText
    // --------------------------------------------------
    // True when txt (trimmed, lower-cased) contains a scientific-notation marker:
    // "e-" / "e+", or a power-of-ten prefix (×10, *10, x10, ^10, each with or
    // without a space before the 10) followed by an explicit sign.
    // Both signs are recognised for every prefix, so "1.5 x 10+3" is treated
    // the same way as "1.5e+3" and "1.5 x 10-3".
    IsScientificText = (txt as text) as logical =>
        let
            CleanText = Text.Lower(Text.Trim(txt)),
            Signs = {"-", "+"},
            PowerPrefixes = {"×10", "× 10", "*10", "* 10", "x10", "x 10", "^10", "^ 10"},
            Indicators =
                List.Transform(Signs, each "e" & _)
                & List.Combine(
                    List.Transform(PowerPrefixes, (prefix) => List.Transform(Signs, (sign) => prefix & sign))
                ),
            IsMatch = List.AnyTrue(List.Transform(Indicators, each Text.Contains(CleanText, _)))
        in
            IsMatch,

    // Returns the part of inputText that follows the first case-insensitive
    // occurrence of delimiter, or null when delimiter does not occur.
    TextAfterIgnoreCase = (inputText as text, delimiter as text) as nullable text =>
        let
            hitAt = Text.PositionOf(Text.Lower(inputText), Text.Lower(delimiter)),
            tailStart = hitAt + Text.Length(delimiter)
        in
            if hitAt = -1 then null else Text.Range(inputText, tailStart),

    // --------------------------------------------------
    // FuzzyMatch (unchanged)
    // --------------------------------------------------
    // Finds the search phrase that best matches some part of sourceText.
    //   sourceText - text to scan; it is trimmed and lower-cased first.
    //   searchList - candidate phrases. Entries that are not text, or that are
    //                blank after trimming, are skipped (a blank phrase would
    //                otherwise "match" an empty window with similarity 1, and a
    //                null entry would raise an error for the whole call).
    //   returnList - optional phrases to report instead of the search phrase;
    //                only honoured when it has as many items as searchList.
    //   threshold  - minimum similarity (0..1) for a window to count; 0.5 if null.
    // Every phrase is tried as written and with its spaces removed. Candidate
    // windows have exactly the phrase's length and must not be preceded or
    // followed by a letter a-z. Similarity = 1 - Levenshtein / max length.
    // Returns [SearchedPhrase, MatchedPhrase, ReturnedPhrase, Similarity]; all
    // four are null when nothing reaches the threshold. MatchedPhrase is taken
    // from the normalised (lower-cased) source text. Ties go to the earliest
    // window / earliest phrase in the list.
    FuzzyMatch = (
        sourceText as text,
        searchList as list,
        optional returnList as nullable list,
        optional threshold as nullable number
    ) as record =>
        let
            actualThreshold = if threshold <> null then threshold else 0.5,
            NoMatch = [SearchedPhrase = null, MatchedPhrase = null, ReturnedPhrase = null, Similarity = null],
            zippedList =
                if returnList <> null and List.Count(searchList) = List.Count(returnList) then
                    List.Transform(List.Zip({searchList, returnList}), each [SearchPhraseOriginal = _{0}, ReturnedPhrase = _{1}])
                else
                    List.Transform(searchList, each [SearchPhraseOriginal = _, ReturnedPhrase = _]),
            // Drop entries that cannot be searched for.
            usableList = List.Select(
                zippedList,
                each (_[SearchPhraseOriginal] is text) and Text.Trim(_[SearchPhraseOriginal]) <> ""
            ),
            normText = NormalizeText(sourceText),
            textLength = Text.Length(normText),
            // Built once instead of on every boundary check.
            letters = List.Buffer({"a".."z", "A".."Z"}),
            isLetter = (c as nullable text) =>
                if c = null then false else List.Contains(letters, c),
            // All windows of the phrase's length that sit on non-letter boundaries.
            slidingPhrases = (searchPhrase as text) as list =>
                let
                    sLen = Text.Length(searchPhrase),
                    range = {0..(textLength - sLen)},
                    windows = List.Transform(range, (i) =>
                        [
                            before = if i = 0 then null else Text.Middle(normText, i - 1, 1),
                            match = Text.Middle(normText, i, sLen),
                            after = if i + sLen >= textLength then null else Text.Middle(normText, i + sLen, 1)
                        ]
                    ),
                    boundaryWindows = List.Select(windows, each
                        (not isLetter(_[before])) and (not isLetter(_[after]))
                    ),
                    result = List.Transform(boundaryWindows, each _[match])
                in
                    result,
            AllMatches = List.Transform(usableList, (entry) =>
                let
                    originalPhrase = entry[SearchPhraseOriginal],
                    normalizedPhrase = NormalizeText(originalPhrase),
                    combinedPhrase = Text.Replace(normalizedPhrase, " ", ""),
                    returnPhrase = entry[ReturnedPhrase],
                    // Best window for one spelling of the phrase, or NoMatch.
                    TestPhrase = (searchPhrase as text) as record =>
                        let
                            possiblePhrases = slidingPhrases(searchPhrase),
                            scored = List.Transform(possiblePhrases, (phrase) =>
                                let
                                    dist = Levenshtein(NormalizeText(phrase), searchPhrase),
                                    maxLen = List.Max({Text.Length(phrase), Text.Length(searchPhrase)}),
                                    similarity = if maxLen = 0 then 1 else 1 - (dist / maxLen)
                                in
                                    [
                                        SearchedPhrase = originalPhrase,
                                        MatchedPhrase = phrase,
                                        ReturnedPhrase = returnPhrase,
                                        Similarity = similarity
                                    ]
                            ),
                            good = List.Select(scored, each _[Similarity] >= actualThreshold),
                            best =
                                if List.Count(good) = 0 then NoMatch
                                else
                                    let
                                        maxSim = List.Max(List.Transform(good, each _[Similarity]))
                                    in
                                        List.First(List.Select(good, each _[Similarity] = maxSim))
                        in
                            best,
                    bestFromSpaced = TestPhrase(normalizedPhrase),
                    // Without spaces both spellings are identical - do not scan twice.
                    bestFromCombined =
                        if combinedPhrase = normalizedPhrase then bestFromSpaced else TestPhrase(combinedPhrase),
                    FinalBest =
                        if bestFromSpaced[Similarity] = null and bestFromCombined[Similarity] = null then
                            NoMatch
                        else if bestFromSpaced[Similarity] = null then
                            bestFromCombined
                        else if bestFromCombined[Similarity] = null then
                            bestFromSpaced
                        else if bestFromSpaced[Similarity] >= bestFromCombined[Similarity] then
                            bestFromSpaced
                        else
                            bestFromCombined
                in
                    FinalBest
            ),
            ValidMatches = List.Select(AllMatches, each _[Similarity] <> null),
            FinalBest =
                if List.Count(ValidMatches) = 0 then
                    NoMatch
                else
                    let
                        topScore = List.Max(List.Transform(ValidMatches, each _[Similarity])),
                        topMatch = List.First(List.Select(ValidMatches, each _[Similarity] = topScore))
                    in
                        topMatch
        in
            FinalBest,

    // --------------------------------------------------
// MAIN MATCH & EXTRACTION PROCESS
// --------------------------------------------------
// Fuzzy-match both phrase groups. match2 is an all-null record when no second
// search list was supplied.
match1 = FuzzyMatch(targetText, SearchPhrase1, ReturnPhrase1, Threshold1),
match2 = if SearchPhrase2 <> null then FuzzyMatch(targetText, SearchPhrase2, ReturnPhrase2, Threshold2)
         else [SearchedPhrase = null, MatchedPhrase = null, ReturnedPhrase = null, Similarity = null],

phrase1 = match1[MatchedPhrase],
phrase2 = match2[MatchedPhrase],

// Cut the text after the second matched phrase. The helper requires a text
// first delimiter and may return null, so fall back to the full text when
// phrase 1 was not matched or no cut position exists.
trimmedText =
    if phrase1 = null then targetText
    else
        let
            cut = FnTextBeforeSecondDelimiterIgnoreCase(targetText, phrase1, phrase2)
        in
            if cut = null then targetText else cut,

// ---- value 1: look in a 30-character window around phrase 1 ----
segment1 = GetWindowedText(trimmedText, phrase1, 30),
val1 = ExtractValueFromSource(segment1, phrase1, direction1),
isSci1 = if val1 <> null then IsScientificText(val1) else false,
// A range is only meaningful next to a matched phrase; without a match the
// "window" is the whole text and any range in it would be unrelated.
ExtractedRange1 = if isSci1 or phrase1 = null then null else GetRangeTextFromSource(segment1),
// For a range the lower bound is reported as the value.
finalVal1 = if isSci1 then val1 else if ExtractedRange1 <> null then Text.BeforeDelimiter(ExtractedRange1, "-") else val1,

// Value 2 is searched in the text that follows phrase 1 (whole trimmed text
// when phrase 1 is missing or cannot be located).
tempSource =
    if phrase1 = null then trimmedText
    else
        let
            rest = TextAfterIgnoreCase(trimmedText, phrase1)
        in
            if rest = null then trimmedText else rest,

// ---- value 2 ----
segment2 = GetWindowedText(tempSource, phrase2, 30),
val2 = ExtractValueFromSource(segment2, phrase2, direction2),
isSci2 = if val2 <> null then IsScientificText(val2) else false,
ExtractedRange2 = if isSci2 or phrase2 = null then null else GetRangeTextFromSource(segment2),
finalVal2 = if isSci2 then val2 else if ExtractedRange2 <> null then Text.BeforeDelimiter(ExtractedRange2, "-") else val2,
// Final number conversion using updated logic
// Converts extracted number text to a number.
//   valueText - e.g. "12.5", "1,5", "2e-5", "1.5 x 10-3", "3×10^4"; null -> null.
// "<mantissa> <×|*|x> 10<exponent>" is rewritten to "<mantissa>e<exponent>".
// The exponent is everything after the leading "10" of the power term (an
// optional "^" is dropped), so exponents that themselves contain "10", such
// as "2x10-10", are handled. A comma is read as the decimal separator only
// when the text contains no dot; otherwise commas are removed. Parsing uses
// the "en-US" culture so the result does not depend on the machine's locale.
// Returns null when the text cannot be parsed.
NormalizeFinalNumber = (valueText as nullable text) as nullable number =>
    let
        raw = if valueText = null then null else Text.Trim(valueText),
        separators = {"×", "*", "x"},
        found = if raw = null then null else List.First(List.Select(separators, each Text.Contains(raw, _)), null),
        parts =
            if raw <> null and found <> null and Text.Contains(raw, "10") then
                let
                    split1 = Text.Split(raw, found),
                    isPair = List.Count(split1) = 2,
                    mainPart = if isPair then Text.Trim(split1{0}) else raw,
                    powerPart = if isPair then Text.Trim(split1{1}) else null,
                    // Strip only the leading base "10"; the rest is the exponent.
                    afterBase =
                        if powerPart <> null and Text.StartsWith(powerPart, "10")
                        then Text.Trim(Text.Middle(powerPart, 2))
                        else null,
                    exponent =
                        if afterBase = null then null
                        else if Text.StartsWith(afterBase, "^") then Text.Trim(Text.Middle(afterBase, 1))
                        else afterBase,
                    recombined = if exponent <> null and exponent <> "" then mainPart & "e" & exponent else raw
                in
                    recombined
            else
                raw,
        // Catch remaining spellings that were not rewritten above.
        sciCleaned =
            Text.Replace(
                Text.Replace(
                    Text.Replace(
                        Text.Replace(
                            Text.Replace(parts, "×", "e"),
                            "x10^", "e"),
                        "*10^", "e"),
                    "*10", "e"),
                " x10", "e"),
        cleaned =
            if Text.Contains(sciCleaned, ",") and not Text.Contains(sciCleaned, ".")
            then Text.Replace(Text.Remove(sciCleaned, {" "}), ",", ".")
            else Text.Replace(Text.Remove(sciCleaned, {" "}), ",", ""),
        asNumber = if raw = null then null else (try Number.FromText(cleaned, "en-US") otherwise null)
    in
        asNumber,

// FnLookupInLists definition
// Looks up the conversion that belongs to a matched unit and applies it.
//   param1         - number to convert (null -> ConvertedValue is null).
//   param2         - returned (canonical) unit of the match; null -> not found.
//   SearchUnitList / ReturnUnitList / ConversionList - parallel lists.
//   searchedPhrase - optional: the search phrase that actually matched. When it
//                    is found in SearchUnitList its position selects the
//                    conversion; otherwise the first position of param2 in
//                    ReturnUnitList is used. This matters when several search
//                    phrases share one return unit but need different factors
//                    (e.g. "mg" -> "g" * 0.001 and "kg" -> "g" * 1000).
// A conversion that Number.From accepts is used as a multiplier; anything else
// is converted to text and evaluated as an M expression in which x is param1.
// Returns [Found, SearchUnit, ReturnUnit, Conversion, ConvertedValue] on
// success, or [Found = false, Message, ConvertedValue = null].
//
// SECURITY NOTE: the conversion text is passed to Expression.Evaluate. Only x
// is supplied as environment, but conversion lists should still come from a
// trusted source.
FnLookupInLists = (
    param1 as any,
    param2 as nullable text,
    SearchUnitList as list,
    ReturnUnitList as list,
    ConversionList as list,
    optional searchedPhrase as nullable text
) as record =>
    let
        bySearch = if searchedPhrase <> null then List.PositionOf(SearchUnitList, searchedPhrase) else -1,
        byReturn = if param2 <> null then List.PositionOf(ReturnUnitList, param2) else -1,
        position = if bySearch <> -1 then bySearch else byReturn,
        Result =
            if param2 = null then
                [Found = false, Message = "Null parameter", ConvertedValue = null]
            else if position = -1 then
                [Found = false, Message = "No match found for: " & param2, ConvertedValue = null]
            else
                let
                    // {position}? yields null instead of an error when a list is shorter.
                    searchUnit = SearchUnitList{position}?,
                    returnUnit = param2,
                    conversion = ConversionList{position}?,
                    conversionTry = try Number.From(conversion),
                    convertedValue =
                        if param1 = null or conversion = null then
                            null
                        else if conversionTry[HasError] then
                            // x is bound through the environment record; no text
                            // substitution, so precision and locale do not matter.
                            try Expression.Evaluate(Text.From(conversion), [x = param1]) otherwise null
                        else
                            param1 * conversionTry[Value]
                in
                    [
                        Found = true,
                        SearchUnit = searchUnit,
                        ReturnUnit = returnUnit,
                        Conversion = conversion,
                        ConvertedValue = convertedValue
                    ]
    in
        Result,

// Run conversions if conversion lists are provided and ReturnedPhrase is not null
Converted1 =
    if ReturnPhrase1 <> null and Conversion1 <> null and match1[ReturnedPhrase] <> null then
        FnLookupInLists(NormalizeFinalNumber(finalVal1), match1[ReturnedPhrase], SearchPhrase1, ReturnPhrase1, Conversion1, match1[SearchedPhrase])
    else
        [ConvertedValue = null],

Converted2 =
    if ReturnPhrase2 <> null and Conversion2 <> null and match2[ReturnedPhrase] <> null then
        FnLookupInLists(NormalizeFinalNumber(finalVal2), match2[ReturnedPhrase], SearchPhrase2, ReturnPhrase2, Conversion2, match2[SearchedPhrase])
    else
        [ConvertedValue = null],

// Final output
// Result record: one group of fields per phrase group (suffix 1 / 2).
// NOTE(review): M record fields are presumably evaluated lazily, so this try
// likely only guards construction of the record, not errors raised when a
// field is read later; also, a null result would not satisfy the function's
// declared `as record` return type - confirm the intended behaviour.
output = try [
    // Fields of the best fuzzy match for group 1 (all null when nothing matched).
    SearchedPhrase1 = match1[SearchedPhrase],
    MatchedPhrase1 = match1[MatchedPhrase],
    ReturnedPhrase1 = match1[ReturnedPhrase],
    Similarity1 = match1[Similarity],
    // Number text found next to the phrase (lower bound when a range was found).
    ExtractedValue1 = finalVal1,
    IsScientific1 = isSci1,
    // "a-b" text of a detected range, or null.
    ExtractedRange1 = ExtractedRange1,
    // ExtractedValue1 parsed to a number by NormalizeFinalNumber.
    ExtractedNumber1 = NormalizeFinalNumber(finalVal1),
    // Result of the unit conversion; null when no conversion was run.
    ConvertedNumber1 = Converted1[ConvertedValue],

    // Same fields for group 2.
    SearchedPhrase2 = match2[SearchedPhrase],
    MatchedPhrase2 = match2[MatchedPhrase],
    ReturnedPhrase2 = match2[ReturnedPhrase],
    Similarity2 = match2[Similarity],
    ExtractedValue2 = finalVal2,
    IsScientific2 = isSci2,
    ExtractedRange2 = ExtractedRange2,
    ExtractedNumber2 = NormalizeFinalNumber(finalVal2),
    ConvertedNumber2 = Converted2[ConvertedValue]
] otherwise null
in
    output

本文标签: powerqueryFuzzyExtract Units and Values in Power QueryStack Overflow