Skip to content

Commit

Permalink
fix unicode decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
zmh-program committed Dec 12, 2023
1 parent bfac02d commit 8b3041d
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 9 deletions.
8 changes: 7 additions & 1 deletion adapter/chatgpt/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,13 @@ func getRobustnessResult(chunk string) string {

matches := compile.FindStringSubmatch(chunk)
if len(matches) > 1 {
return matches[1]
partial := matches[1]
// if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's`
if utils.ContainUnicode(partial) {
partial = utils.DecodeUnicode(partial)
}

return partial
} else {
return ""
}
Expand Down
6 changes: 3 additions & 3 deletions adapter/oneapi/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ func getRobustnessResult(chunk string) string {
matches := compile.FindStringSubmatch(chunk)
if len(matches) > 1 {
partial := matches[1]
// if is the unicode character
if strings.HasPrefix(partial, "\\u") {
return utils.DecodeUnicode(partial)
// if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's`
if utils.ContainUnicode(partial) {
partial = utils.DecodeUnicode(partial)
}

return partial
Expand Down
15 changes: 10 additions & 5 deletions utils/char.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,16 +152,21 @@ func ExtractImageUrls(data string) []string {
return re.FindAllString(data, -1)
}

// containUnicodeRe matches a single literal `\uXXXX` escape sequence
// (backslash, 'u', four hexadecimal digits). It is compiled once at package
// scope so ContainUnicode does not recompile it on every call.
var containUnicodeRe = regexp.MustCompile(`\\u([0-9a-fA-F]{4})`)

// ContainUnicode reports whether data contains at least one literal
// `\uXXXX` escape sequence anywhere in the string.
//
// For example, `hi\u2019s` yields true, while `hello` and the truncated
// `\u20` yield false.
func ContainUnicode(data string) bool {
	return containUnicodeRe.MatchString(data)
}

// decodeUnicodeRe matches a run of one or more consecutive literal `\uXXXX`
// escapes. Matching whole runs (rather than single escapes) lets
// DecodeUnicode see both halves of a UTF-16 surrogate pair at once.
var decodeUnicodeRe = regexp.MustCompile(`(?:\\u[0-9a-fA-F]{4})+`)

// DecodeUnicode replaces every literal `\uXXXX` escape sequence in data with
// the character it denotes and returns the resulting string; text outside
// escape sequences is left untouched.
//
// For example, `hi\u2019s` becomes "hi’s". A high surrogate immediately
// followed by a low surrogate (e.g. `\ud83d\ude00`) is combined into the
// single code point it encodes. An unpaired surrogate cannot be represented
// in UTF-8 and is emitted as U+FFFD, the Unicode replacement character.
func DecodeUnicode(data string) string {
	const escLen = 6 // byte length of one `\uXXXX` escape

	return decodeUnicodeRe.ReplaceAllStringFunc(data, func(run string) string {
		decoded := make([]rune, 0, len(run)/escLen)

		for i := 0; i+escLen <= len(run); i += escLen {
			// Skip the leading `\u` and parse the four hex digits.
			unit, err := strconv.ParseUint(run[i+2:i+escLen], 16, 16)
			if err != nil {
				// Not reachable for input accepted by the pattern; keep the
				// original text rather than emit a partial decode.
				return run
			}
			r := rune(unit)

			// A high surrogate followed directly by a low surrogate encodes
			// one supplementary-plane code point; consume both escapes.
			if r >= 0xD800 && r <= 0xDBFF && i+2*escLen <= len(run) {
				low, err := strconv.ParseUint(run[i+escLen+2:i+2*escLen], 16, 16)
				if err == nil && low >= 0xDC00 && low <= 0xDFFF {
					r = 0x10000 + ((r - 0xD800) << 10) + (rune(low) - 0xDC00)
					i += escLen
				}
			}

			decoded = append(decoded, r)
		}

		// Converting []rune to string encodes as UTF-8 and maps any leftover
		// lone surrogate to U+FFFD.
		return string(decoded)
	})
}

0 comments on commit 8b3041d

Please sign in to comment.