admin管理员组

文章数量:1344975

I'm trying to extract structured data from git log output that looks like this:

sha:"1ac31eadbe9cdf4d365de68b24a5daa2ab9c2575" refs:[HEAD->release/1.6.0;tag:release/1.6.0/api;origin/release/1.6.0] notes:[];
sha:"95c8adff0ec6a3064aa03f395e7cce63dd7cb21b" refs:[] notes:[];
sha:"2877f582fe1a4cdbf339d3183fca3da2e496b90b" refs:[] notes:[];
sha:"fbcd95c938e42c360b4ad9227ee05475e5308dd8" refs:[tag:release/1.5.0/api;tag:release/1.5.0/worker;tag:release/1.5.0/docs;origin/release/1.5.0] notes:[utc20241017065043 release/1.5.0/api

utc20250401123529 release/1.5.0/worker

utc20250401172120 release/1.5.0/docs
];
sha:"d520de3061cc71ab887b9dd924d8e432e4673127" refs:[] notes:[];
sha:"9f869c42e1209108fc061d383e48ae00e6dba21d" refs:[tag:1.0.0;tag:1.0-preview;origin/1.0.0] notes:[utc20250101163814 1.0.0
];
sha:"0b41aaa2a72b3029d6878bb827a03dc99188d1c5" refs:[] notes:[];
sha:"12b8220819a71b8a02c5ab542b4fae81d9644112" refs:[origin/experiment-123] notes:[];
sha:"3de7d9b9ed2dcf28a1aaa6db79239c55ce0ef48e" refs:[] notes:[];

(Some entries contain multi-line notes and multiple refs.)

The Git command looks like this:

git log --pretty='tformat:sha:"%H" refs:[%(decorate:pointer=->,prefix=,suffix=,tag=tag:,separator=;)] notes:[%N];' --notes=custom-notes

I've tried adapting Superpower's JsonParser sample code to this task:

using System.Linq;
using Superpower;
using Superpower.Display;
using Superpower.Model;
using Superpower.Parsers;
using Superpower.Tokenizers;

/// <summary>
/// Kinds of token the git log tokenizer can emit.
/// </summary>
enum GitLogToken
{
    /// <summary>Opening square bracket, <c>[</c>.</summary>
    [Token(Example = "[")]
    LSquareBracket,

    /// <summary>Closing square bracket, <c>]</c>.</summary>
    [Token(Example = "]")]
    RSquareBracket,

    /// <summary>Comma, <c>,</c>.</summary>
    [Token(Example = ",")]
    Comma,

    /// <summary>Colon, <c>:</c>.</summary>
    [Token(Example = ":")]
    Colon,

    /// <summary>Semicolon, <c>;</c>.</summary>
    [Token(Example = ";")]
    SemiColon,

    /// <summary>A double-quoted string, including its surrounding quotes.</summary>
    String,

    /// <summary>An unquoted word, such as the <c>sha</c>, <c>refs</c> or <c>notes</c> key prefix.</summary>
    KeyPrefix,
}

/// <summary>
/// Splits raw git log text into <see cref="GitLogToken"/>s.
/// </summary>
static class GitLogTokenizer
{
    // Characters that are tokens in their own right (or open a quoted string) and
    // therefore always terminate a bare word.
    const string BareWordDelimiters = ",:;[]\"";

    /// <summary>
    /// Recognizes a double-quoted string without capturing its content; the token's
    /// span is all the tokenizer needs.
    /// </summary>
    static TextParser<Unit> GitLogStringToken { get; } =
        from open in Character.EqualTo('"')
        from content in Character.Except('"').Value(Unit.Value)
            .IgnoreMany()
        from close in Character.EqualTo('"')
        select Unit.Value;

    /// <summary>
    /// Recognizes a run of characters containing neither whitespace nor one of
    /// <see cref="BareWordDelimiters"/>.
    /// </summary>
    /// <remarks>
    /// Replaces <c>Identifier.CStyle</c>, which only accepts C-style identifiers and so
    /// stops at characters such as '/', '-', '>' and '.', and cannot start on a digit.
    /// Ref names and note text in the log (for example <c>HEAD->release/1.6.0</c> or
    /// <c>1.0.0</c>) contain exactly those characters, so tokenization of real output failed.
    /// </remarks>
    static TextParser<TextSpan> BareWord { get; } = Span.WithAll(IsBareWordChar);

    /// <summary>
    /// The tokenizer instance. Whitespace (including newlines) is skipped; recognizers
    /// are listed in the order they are registered with the builder.
    /// </summary>
    public static Tokenizer<GitLogToken> Instance { get; } =
        new TokenizerBuilder<GitLogToken>()
            .Ignore(Span.WhiteSpace)
            .Match(Character.EqualTo(','), GitLogToken.Comma)
            .Match(Character.EqualTo(':'), GitLogToken.Colon)
            .Match(Character.EqualTo(';'), GitLogToken.SemiColon)
            .Match(Character.EqualTo('['), GitLogToken.LSquareBracket)
            .Match(Character.EqualTo(']'), GitLogToken.RSquareBracket)
            .Match(GitLogStringToken, GitLogToken.String)
            // Covers the `sha`, `refs` and `notes` prefixes as well as ref names and
            // note words; the parser tells them apart by position and text.
            .Match(BareWord, GitLogToken.KeyPrefix, requireDelimiters: false)
            .Build();

    // True for any character that may appear inside a bare word.
    static bool IsBareWordChar(char ch) =>
        !char.IsWhiteSpace(ch) && BareWordDelimiters.IndexOf(ch) < 0;
}

/// <summary>
/// Character-level parsers applied to the text of individual tokens.
/// </summary>
static class GitLogTextParsers
{
    /// <summary>
    /// Parses a double-quoted string and yields the characters between the quotes.
    /// </summary>
    public static TextParser<string> String { get; } =
        Character.EqualTo('"')
            .IgnoreThen(Character.ExceptIn('"').Many())
            .Then(body => Character.EqualTo('"').Value(new string(body)));
}

/// <summary>
/// Token-level grammar for the git log text, together with the
/// <see cref="TryParse"/> entry point.
/// </summary>
static class GitLogParser
{
    // A String token, re-read with the character-level parser to get the text between its quotes.
    static TokenListParser<GitLogToken, object> GitLogString { get; } =
        from text in Token.EqualTo(GitLogToken.String).Apply(GitLogTextParsers.String)
        select (object)text;

    // '[' followed by comma-delimited values up to the closing ']'.
    // Parse.Ref defers the lookup of GitLogEntry, which is initialized further down.
    static TokenListParser<GitLogToken, object> GitLogArray { get; } =
        Token.EqualTo(GitLogToken.LSquareBracket)
            .IgnoreThen(
                Parse.Ref(() => GitLogEntry!)
                    .ManyDelimitedBy(
                        Token.EqualTo(GitLogToken.Comma),
                        end: Token.EqualTo(GitLogToken.RSquareBracket)))
            .Select(items => (object)items);

    // A value is either a string or an array of values.
    static TokenListParser<GitLogToken, object?> GitLogEntry { get; } =
        GitLogString
            .AsNullable()
            .Or(GitLogArray.AsNullable())
            // TODO: every entry is terminated by a semi-colon ';'
            .Named("GitLog value");

    // TODO: a GitLogEntry can carry an array of notes; notes are newline-separated.

    // TODO: a GitLogEntry can carry an array of refs; refs are semi-colon-separated.

    // TODO: a "log document" holds multiple GitLogEntry items.

    // A document is currently a single value that must consume all tokens.
    static TokenListParser<GitLogToken, object?> GitLogDocument { get; } = GitLogEntry.AtEnd();

    /// <summary>
    /// Tokenizes and parses <paramref name="gitLog"/>.
    /// </summary>
    /// <param name="gitLog">The text to parse.</param>
    /// <param name="value">The parsed value on success; otherwise <c>null</c>.</param>
    /// <param name="error">The failed result rendered as text on failure; otherwise <c>null</c>.</param>
    /// <param name="errorPosition">The failure position, or <see cref="Position.Empty"/> on success.</param>
    /// <returns><c>true</c> when both tokenizing and parsing succeed.</returns>
    public static bool TryParse(string gitLog, out object? value, out string? error, out Position errorPosition)
    {
        // Stays null on both failure paths below.
        value = null;

        var tokenized = GitLogTokenizer.Instance.TryTokenize(gitLog);
        if (!tokenized.HasValue) {
            error = tokenized.ToString();
            errorPosition = tokenized.ErrorPosition;
            return false;
        }

        var document = GitLogDocument.TryParse(tokenized.Value);
        if (!document.HasValue) {
            error = document.ToString();
            errorPosition = document.ErrorPosition;
            return false;
        }

        value = document.Value;
        error = null;
        errorPosition = Position.Empty;
        return true;
    }
}

I don't know where to go from here.

Can anyone direct me to more suitable Superpower examples for these kinds of inputs (i.e. logs or textual data with ad hoc markup)?

本文标签: c# - Extract structured data from Git log output using Superpower - Stack Overflow