-
Notifications
You must be signed in to change notification settings - Fork 39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
InternetAddress parsing is unacceptably slow #126
Comments
Whenever a solution is chosen, I can try to implement it in a library |
GString is essentially a StringBuilder (if you are referring to the .NET StringBuilder class). |
g_string_new ("") calls g_string_sized_new(2), but g_string_sized_new() uses MAX(dfl_size, 64), so it should allocate with an initial buffer of 64 bytes. https://github.com/GNOME/glib/blob/main/glib/gstring.c#L108 That should be plenty for your addrspec (which is 24 characters), so it shouldn't need to resize. |
It might be possible to refactor the code to re-use the same GString over and over for building the addrspecs (just use g_string_truncate(str, 0) at the start of addrspec_parse) and then just g_malloc()/g_strlcpy() the buffer into a new char* string. |
That's an interesting idea!
|
Here's an implementation of my idea, but it seems to somehow be slower on my macOS box (arm64): diff --git a/gmime/internet-address.c b/gmime/internet-address.c
index ae07d13f..08209973 100644
--- a/gmime/internet-address.c
+++ b/gmime/internet-address.c
@@ -98,7 +98,7 @@ enum {
INTERNET_ADDRESS_FOLD = 1 << 1,
};
-static gboolean addrspec_parse (const char **in, const char *sentinels, char **addrspec, int *at);
+static gboolean addrspec_parse (GString *buffer, const char **in, const char *sentinels, char **addrspec, int *at);
static void internet_address_class_init (InternetAddressClass *klass);
static void internet_address_init (InternetAddress *ia, InternetAddressClass *klass);
@@ -387,14 +387,18 @@ internet_address_mailbox_new (const char *name, const char *addr)
{
InternetAddressMailbox *mailbox;
const char *inptr = addr;
+ GString *buffer;
g_return_val_if_fail (addr != NULL, NULL);
mailbox = g_object_new (INTERNET_ADDRESS_TYPE_MAILBOX, NULL);
+ buffer = g_string_sized_new (64);
- if (!addrspec_parse (&inptr, "", &mailbox->addr, &mailbox->at))
+ if (!addrspec_parse (buffer, &inptr, "", &mailbox->addr, &mailbox->at))
mailbox->addr = g_strdup (addr);
-
+
+ g_string_free (buffer, TRUE);
+
_internet_address_set_name ((InternetAddress *) mailbox, name);
return (InternetAddress *) mailbox;
@@ -412,6 +416,7 @@ void
internet_address_mailbox_set_addr (InternetAddressMailbox *mailbox, const char *addr)
{
const char *inptr = addr;
+ GString *buffer;
g_return_if_fail (INTERNET_ADDRESS_IS_MAILBOX (mailbox));
@@ -423,9 +428,13 @@ internet_address_mailbox_set_addr (InternetAddressMailbox *mailbox, const char *
g_free (mailbox->addr);
- if (!addrspec_parse (&inptr, "", &mailbox->addr, &mailbox->at))
+ buffer = g_string_sized_new (64);
+
+ if (!addrspec_parse (buffer, &inptr, "", &mailbox->addr, &mailbox->at))
mailbox->addr = g_strdup (addr);
-
+
+ g_string_free (buffer, TRUE);
+
g_mime_event_emit (((InternetAddress *) mailbox)->changed, NULL);
}
@@ -1632,19 +1641,26 @@ domain_parse (GString *str, const char **in, const char *sentinels)
return dotatom_parse (str, in, sentinels);
}
+static char *
+buffer_strdup (GString *buffer)
+{
+ char *str = g_malloc (buffer->len + 1);
+ strcpy (str, buffer->str);
+ return str;
+}
+
static gboolean
-addrspec_parse (const char **in, const char *sentinels, char **addrspec, int *at)
+addrspec_parse (GString *buffer, const char **in, const char *sentinels, char **addrspec, int *at)
{
const char *inptr = *in;
- GString *str;
-
- str = g_string_new ("");
-
- if (!localpart_parse (str, &inptr))
+
+ g_string_truncate (buffer, 0);
+
+ if (!localpart_parse (buffer, &inptr))
goto error;
if (*inptr == '\0' || strchr (sentinels, *inptr)) {
- *addrspec = g_string_free (str, FALSE);
+ *addrspec = buffer_strdup (buffer);
*in = inptr;
*at = -1;
return TRUE;
@@ -1653,8 +1669,8 @@ addrspec_parse (const char **in, const char *sentinels, char **addrspec, int *at
if (*inptr != '@')
goto error;
- *at = str->len;
- g_string_append_c (str, *inptr++);
+ *at = buffer->len;
+ g_string_append_c (buffer, *inptr++);
if (*inptr == '\0')
goto error;
@@ -1665,16 +1681,15 @@ addrspec_parse (const char **in, const char *sentinels, char **addrspec, int *at
if (*inptr == '\0')
goto error;
- if (!domain_parse (str, &inptr, sentinels))
+ if (!domain_parse (buffer, &inptr, sentinels))
goto error;
- *addrspec = g_string_free (str, FALSE);
+ *addrspec = buffer_strdup (buffer);
*in = inptr;
return TRUE;
error:
- g_string_free (str, TRUE);
*addrspec = NULL;
*in = inptr;
*at = -1;
@@ -1684,7 +1699,7 @@ addrspec_parse (const char **in, const char *sentinels, char **addrspec, int *at
// TODO: rename to angleaddr_parse??
static gboolean
-mailbox_parse (GMimeParserOptions *options, const char **in, const char *name, InternetAddress **address)
+mailbox_parse (GMimeParserOptions *options, GString *buffer, const char **in, const char *name, InternetAddress **address)
{
GMimeRfcComplianceMode mode = g_mime_parser_options_get_address_compliance_mode (options);
const char *inptr = *in;
@@ -1728,7 +1743,7 @@ mailbox_parse (GMimeParserOptions *options, const char **in, const char *name, I
// ';' as well in case the mailbox is within a group address.
//
// Example: <[email protected], [email protected]>
- if (!addrspec_parse (&inptr, COMMA_GREATER_THAN_OR_SEMICOLON, &addrspec, &at))
+ if (!addrspec_parse (buffer, &inptr, COMMA_GREATER_THAN_OR_SEMICOLON, &addrspec, &at))
goto error;
if (!skip_cfws (&inptr))
@@ -1797,7 +1812,7 @@ group_parse (InternetAddressGroup *group, GMimeParserOptions *options, const cha
}
static gboolean
-address_parse (GMimeParserOptions *options, AddressParserFlags flags, const char **in, const char **charset, InternetAddress **address, gint64 offset)
+address_parse (GMimeParserOptions *options, AddressParserFlags flags, GString *buffer, const char **in, const char **charset, InternetAddress **address, gint64 offset)
{
GMimeRfcComplianceMode mode = g_mime_parser_options_get_address_compliance_mode (options);
int min_words = g_mime_parser_options_get_allow_addresses_without_domain (options) ? 1 : 0;
@@ -1885,7 +1900,7 @@ address_parse (GMimeParserOptions *options, AddressParserFlags flags, const char
if (!(flags & ALLOW_MAILBOX))
goto error;
- if (!addrspec_parse (&inptr, sentinels, &addrspec, &at))
+ if (!addrspec_parse (buffer, &inptr, sentinels, &addrspec, &at))
goto error;
skip_lwsp (&inptr);
@@ -1961,7 +1976,7 @@ address_parse (GMimeParserOptions *options, AddressParserFlags flags, const char
/* rewind back to the beginning of the local-part */
inptr = start;
- if (!addrspec_parse (&inptr, COMMA_GREATER_THAN_OR_SEMICOLON, &addrspec, &at))
+ if (!addrspec_parse (buffer, &inptr, COMMA_GREATER_THAN_OR_SEMICOLON, &addrspec, &at))
goto error;
skip_lwsp (&inptr);
@@ -2048,7 +2063,7 @@ address_parse (GMimeParserOptions *options, AddressParserFlags flags, const char
name = g_strdup ("");
}
- retval = mailbox_parse (options, &inptr, name, address);
+ retval = mailbox_parse (options, buffer, &inptr, name, address);
g_free (name);
*in = inptr;
@@ -2072,6 +2087,7 @@ address_list_parse (InternetAddressList *list, GMimeParserOptions *options, cons
InternetAddress *address;
const char *charset;
const char *inptr;
+ GString *buffer;
if (!skip_cfws (in))
return FALSE;
@@ -2080,7 +2096,9 @@ address_list_parse (InternetAddressList *list, GMimeParserOptions *options, cons
if (*inptr == '\0')
return FALSE;
-
+
+ buffer = g_string_sized_new (64);
+
while (*inptr) {
gboolean separator_between_addrs = FALSE;
@@ -2089,7 +2107,7 @@ address_list_parse (InternetAddressList *list, GMimeParserOptions *options, cons
charset = NULL;
- if (!address_parse (options, ALLOW_ANY, &inptr, &charset, &address, offset)) {
+ if (!address_parse (options, ALLOW_ANY, buffer, &inptr, &charset, &address, offset)) {
/* skip this address... */
while (*inptr && *inptr != ',' && (!is_group || *inptr != ';'))
inptr++;
@@ -2106,6 +2124,7 @@ address_list_parse (InternetAddressList *list, GMimeParserOptions *options, cons
/* Note: we loop here in case there are any null addresses between commas */
do {
if (!skip_cfws (&inptr)) {
+ g_string_free (buffer, TRUE);
*in = inptr;
return FALSE;
@@ -2121,7 +2140,9 @@ address_list_parse (InternetAddressList *list, GMimeParserOptions *options, cons
if (can_warn && !(separator_between_addrs || (*inptr == '\0') || (is_group && *inptr == ';')))
_g_mime_parser_options_warn (options, offset, GMIME_WARN_INVALID_ADDRESS_LIST, *in);
}
-
+
+ g_string_free (buffer, TRUE);
+
*in = inptr;
return TRUE; |
@sigasigasiga my patch should theoretically get better perf than yours since g_strdup() has to call strlen() on the string, then malloc(), then strcpy() whereas mine just uses the str->len (well, I renamed it to buffer, but same thing) to avoid the strlen() call. That said, I also did not experience any improvement (in fact, it somehow got slower?). |
@jstedfast I have another idea how to reduce allocations: store addresses in a
This way |
Most likely there's something more costly than the allocations |
Tested a theory I had which is that it's not the parser that is slow at all, but rather the fact that there is a separate If I modify the message such that all of the addresses are in a single
By comparison, this is what I get w/ your original submitted test message:
|
Here's the modified message that puts all of the addresses in a single |
…ations If a message has a ton of To: headers, for example, we were allocating a new InternetAddressList for each To: header and concatenating them onto a single list after parsing them. This improves performance by ~20% for those types of cases. Partial fix for issue #126
With the above patch applied, I now get:
|
I think I know how to fix the rest of the performance problem... it'll just take some time because I don't have a lot of free time to work on this. |
Nevermind, wasn't that hard. |
Wow, that's really impressive! Thank you! |
New timings:
|
Oops, fixed the build (I had accidentally committed part of another local optimization idea that went unused) |
@jstedfast would you mind creating a new release with this fix in it? |
released 3.2.13 |
Thank you very much! |
Did this change some behaviour? Running the notmuch testsuite with 3.2.13 exposes the following failure: |
It shouldn't have |
I recommend re-opening this issue, because i don't think the changes made to resolve matters addressed the underlying problem; rather, they were just optimized for a specific sample message. In particular, i think the problem is that the interaction in I might be misunderstanding the code here, but it looks to me like as a parser passes through a message's header fields, it invokes In the old code, which invoked only So if you have one So it's not surprising that a message with 1000 |
With the old code, that was exactly the problem. The The new logic checks to see if the header was appended (vs inserted), and if it was appended, it calls Thus, the new implementation is O(N) instead of O(N²). This was done in commit 4a80ae5. |
I understand that the "new implementation" should be O(N), but the "old implementation" is still there, right? What's stopping it from getting triggered is the logic in This feels a bit like it's laying a small booby trap in the code -- a reader can only understand why it's doing it this way if they see how the different pieces fit together, and it's likely to be brittle if someone makes a change (to either piece of code). If someone tries to build a message from scratch in a weird way, for example, by building a header block that ends in I don't really have a good way forward to propose concretely, so i'll understand if you just decide to leave it as-is. I just wanted to make sure i'm understanding the situation clearly. Thanks for following up about this! |
(Mostly) True, but that particular use-case is the hardest-hit just due to the fact that the parser adds a whole slew of headers to the message in a loop (and in particular, the address headers which must be combined). The other case this handles (hence the "Mostly" above) is developers doing As you correctly point out, if they do That said, there probably aren't many GMime developers who are doing that. Most likely they just let GMimeMessage handle serialization of addresses. The exception to this is most likely to be prepending "Received" headers (which I know GMime is used for outside of notmuch and other user-facing email clients). Luckily, GMime will not hit any sort of O(N²) situation for Received headers.
I actually happen to like the older RFCs for allowing multiple To/Cc/Bcc/etc headers, I think having multiple headers helps to make the raw headers look nicer than 1 address header with a huge list of folded addresses. But that's just my quirkiness, I'm sure :-)
There are 2 more things we can do that I've already done in MimeKit:
The first optimization is what helped make MimeKit about as fast as GMime back in the early days of MimeKit development. The second optimization is more recent (part of MimeKit v3.0 development) and made a pretty massive performance improvement in MimeKit's MimeParser when I implemented that (not really a surprise seeing as how it defers a ton of expensive parsing until later when the user actually wants to get access to those parsed addresses/etc). That work was done as part of this optimization effort: jstedfast/MimeKit#695 (comment) I'm sure there's a ton more optimizations we could pull from that since MimeKit and GMime are so similarly designed. But anyway, yea... current MimeKit v3.x versions have 3 parsers:
The thread that I linked above has a bunch of benchmark results that show the gains made by the newer parsers.
|
As subject says,
InternetAddress
parsing is very slow. If I try to parse an email with like 30000 recipients, it takes about 800 seconds to finish on my machine (i7-3770).
I took a look at what Callgrind says:
And we can clearly see that all the slowness comes from
addrspec_parse
function, where GString
is made for every recipient, and then it does reallocations many many times. Currently I can think of only 2 ways to somehow fix this:
StringBuilder
and use it to create the resulting address in order to reduce reallocation time
The text was updated successfully, but these errors were encountered: