diff --git a/internal/labels/labels.go b/internal/labels/labels.go
index 6632908..dd9382e 100644
--- a/internal/labels/labels.go
+++ b/internal/labels/labels.go
@@ -14,9 +14,12 @@ import (
 	"fmt"
 	"html/template"
 	"log"
+	"regexp"
+	"strings"
 
 	"golang.org/x/oscar/internal/github"
 	"golang.org/x/oscar/internal/llm"
+	"rsc.io/markdown"
 )
 
 // A Category is a classification for an issue.
@@ -38,11 +41,12 @@ func IssueCategoryFromList(ctx context.Context, cgen llm.ContentGenerator, iss *
 		return Category{}, "", errors.New("issue is a pull request")
 	}
 
+	body := cleanIssueBody(iss.Body)
 	// Extract issue text into a string.
 	var issueText bytes.Buffer
-	err = template.Must(template.New("body").Parse(body)).Execute(&issueText, bodyArgs{
+	err = template.Must(template.New("body").Parse(bodyTemplate)).Execute(&issueText, bodyArgs{
 		Title: iss.Title,
-		Body:  iss.Body,
+		Body:  body,
 	})
 	if err != nil {
 		return Category{}, "", err
@@ -73,6 +77,66 @@ func IssueCategoryFromList(ctx context.Context, cgen llm.ContentGenerator, iss *
 	return Category{}, "", fmt.Errorf("no category matches LLM response %q", jsonRes)
 }
 
+// htmlCommentRegexp matches an HTML comment: "<!--", then the shortest run of
+// any characters (newlines included, via the (?s) flag), then "-->".
+//
+// TODO(jba): this is approximate.
+// See https://developer.mozilla.org/en-US/docs/Web/HTML/Comments for the exact syntax.
+var htmlCommentRegexp = regexp.MustCompile(`(?s)<!--.*?-->`)
+
+// cleanIssueBody adjusts the issue body to improve the odds that it will be properly
+// labeled.
+//
+// It parses text as Markdown, deletes HTML comments found in HTML blocks, and
+// returns the document as re-formatted by markdown.Format, so the result can
+// differ from text even when there is nothing to delete.
+func cleanIssueBody(text string) string {
+	// TODO(jba): These settings are also used in fix.go to parse bodies. Factor out.
+	p := &markdown.Parser{
+		AutoLinkText:  true,
+		Strikethrough: true,
+		HeadingIDs:    true,
+		Emoji:         true,
+	}
+	doc := p.Parse(text)
+
+	// cleanBlock visits x and the blocks nested inside it, editing HTML blocks in place.
+	var cleanBlock func(markdown.Block)
+	cleanBlock = func(x markdown.Block) {
+		switch x := x.(type) {
+		case *markdown.Document:
+			for _, sub := range x.Blocks {
+				cleanBlock(sub)
+			}
+		case *markdown.HTMLBlock:
+			// Delete comments.
+			// Each Text is a line; join them first so that a comment
+			// spanning several lines is matched as a whole.
+			t := strings.Join(x.Text, "\n")
+			t = htmlCommentRegexp.ReplaceAllString(t, "")
+			x.Text = strings.Split(t, "\n")
+		case *markdown.Quote:
+			for _, sub := range x.Blocks {
+				cleanBlock(sub)
+			}
+		case *markdown.List:
+			for _, sub := range x.Items {
+				cleanBlock(sub)
+			}
+		case *markdown.Item:
+			for _, sub := range x.Blocks {
+				cleanBlock(sub)
+			}
+		case *markdown.Heading:
+			cleanBlock(x.Text)
+		case *markdown.Paragraph:
+			cleanBlock(x.Text)
+		}
+	}
+	cleanBlock(doc)
+	return markdown.Format(doc)
+}
+
 // response is the response that should generated by the LLM.
 // It must match [responseSchema].
 type response struct {
@@ -102,7 +166,7 @@ Report the category of the issue and an explanation of your decision.
 Each category and its description are listed below.
 `
 
-const body = `
+const bodyTemplate = `
 The title of the issue is: {{.Title}}
 The body of the issue is: {{.Body}}
 `
diff --git a/internal/labels/labels_test.go b/internal/labels/labels_test.go
index 4dc3cf3..e0fb0ba 100644
--- a/internal/labels/labels_test.go
+++ b/internal/labels/labels_test.go
@@ -39,3 +39,34 @@ func kindTestGenerator() llm.ContentGenerator {
 		return `{"CategoryName":"other","Explanation":"whatever"}`, nil
 	})
 }
+
+// TestCleanIssueBody checks cleanIssueBody's output for plain Markdown and for
+// bodies containing HTML comments.
+// NOTE(review): the text inside the comments in the inputs below is
+// placeholder wording; only their position matters to the expected output.
+func TestCleanIssueBody(t *testing.T) {
+	for _, tc := range []struct {
+		in   string
+		want string
+	}{
+		{"", ""},
+		{"# H\nword\nword2\n", "# H\n\nword\nword2\n"},
+		{
+			"<!-- comment -->\n### H3\n<!-- another --> done",
+			"\n\n### H3\n\n done\n",
+		},
+		{
+			"<!-- comment -->\n### H3\n<!-- another -->\ndone",
+			"\n\n### H3\n\n\n\ndone\n",
+		},
+		{
+			"<!-- a --> b -->",
+			" b -->\n",
+		},
+	} {
+		got := cleanIssueBody(tc.in)
+		if got != tc.want {
+			t.Errorf("%q:\ngot %q\nwant %q", tc.in, got, tc.want)
+		}
+	}
+}