-
Notifications
You must be signed in to change notification settings - Fork 126
/
policy.xml
171 lines (153 loc) · 8.73 KB
/
policy.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
<policies>
<inbound>
    <base />
    <!-- Load the shared backend list (priority, weight and throttling state per backend) from the cache -->
    <cache-lookup-value key="listBackends" variable-name="listBackends" />
    <!-- If the lookup did not populate the variable, rebuild the list from the load balancer configuration -->
    <choose>
        <when condition="@(!context.Variables.ContainsKey("listBackends"))">
            <!-- Load balancer configuration, kept in the named value openai-lb-config.
                 The expression below expects a JSON array of objects with "name", "priority" and (optionally) "weight". -->
            <set-variable name="loadBalancerConfig" value="{{openai-lb-config}}" />
            <set-variable name="listBackends" value="@{
                // -------------------------------------------------
                // ------- Explanation of backend properties -------
                // -------------------------------------------------
                // "name":         Name of the backend; it is later passed to set-backend-service as the backend-id.
                // "priority":     Lower value means higher priority over other backends.
                //                 If you have one or more Priority 1 backends, they will always be used instead
                //                 of Priority 2 or higher. Backends with higher values are only used while all
                //                 lower-value (top priority) backends are throttling.
                // "weight":       Relative weight of the backend, copied from the configuration.
                //                 Defaults to 1 when the configuration entry does not specify it.
                // "isThrottling": Indicates if this endpoint is currently marked as failing (429 or 5xx response)
                // "retryAfter":   Point in time after which a throttling endpoint is considered healthy again
                JArray configuredBackends = JArray.Parse((string)context.Variables["loadBalancerConfig"]);
                JArray backends = new JArray();
                foreach (JObject config in configuredBackends)
                {
                    backends.Add(new JObject()
                    {
                        { "name", config.GetValue("name").ToString() },
                        { "priority", config.GetValue("priority").Value<int>() },
                        // A missing or null weight must not fail the whole request; treat it as 1
                        { "weight", config.Value<int?>("weight") ?? 1 },
                        { "isThrottling", false },
                        { "retryAfter", DateTime.MinValue }
                    });
                }
                return backends;
            }" />
            <!-- Store the freshly built list in the cache (duration 60) so other requests can reuse it -->
            <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
        </when>
    </choose>
    <!-- Per-request state for the retry loop in the backend section:
         backendIndex is overwritten with the index of the selected backend, and remainingBackends
         starts above zero because the retry condition only retries while it is greater than 0 -->
    <set-variable name="backendIndex" value="-1" />
    <set-variable name="remainingBackends" value="1" />
    <!-- Obtain a managed identity token for the Cognitive Services resource and send it as the bearer token -->
    <authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="managed-id-access-token" ignore-error="false" />
    <set-header name="Authorization" exists-action="override">
        <value>@("Bearer " + (string)context.Variables["managed-id-access-token"])</value>
    </set-header>
</inbound>
<backend>
    <!-- Retry immediately (interval 0, at most 50 times) while the last response was a 429 or 5xx
         and at least one backend is still not marked as throttling -->
    <retry condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) && (Convert.ToInt32(context.Variables["remainingBackends"]) > 0))" count="50" interval="0">
        <!-- Before picking the backend, clear the throttling flag of every backend whose retry-after time has passed -->
        <set-variable name="listBackends" value="@{
            JArray backends = (JArray)context.Variables["listBackends"];
            // Both sides are compared in UTC so the check does not depend on the local time zone or DST changes
            DateTime utcNow = DateTime.UtcNow;
            for (int i = 0; i < backends.Count; i++)
            {
                JObject backend = (JObject)backends[i];
                if (backend.Value<bool>("isThrottling") && utcNow >= backend.Value<DateTime>("retryAfter").ToUniversalTime())
                {
                    backend["isThrottling"] = false;
                    backend["retryAfter"] = DateTime.MinValue;
                }
            }
            return backends;
        }" />
        <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
        <!-- Main selection logic: among the non-throttling backends keep only those with the lowest
             priority value, then pick one of them at random in proportion to its weight -->
        <set-variable name="backendIndex" value="@{
            JArray backends = (JArray)context.Variables["listBackends"];
            int bestPriority = Int32.MaxValue;
            List<int> candidates = new List<int>();
            List<int> candidateWeights = new List<int>();
            for (int i = 0; i < backends.Count; i++)
            {
                JObject backend = (JObject)backends[i];
                if (backend.Value<bool>("isThrottling"))
                {
                    continue;
                }
                int priority = backend.Value<int>("priority");
                if (priority < bestPriority)
                {
                    // Found a better priority level: everything collected so far is discarded
                    bestPriority = priority;
                    candidates.Clear();
                    candidateWeights.Clear();
                }
                if (priority == bestPriority)
                {
                    candidates.Add(i);
                    // A missing weight counts as 1 and a negative weight as 0
                    candidateWeights.Add(Math.Max(backend.Value<int?>("weight") ?? 1, 0));
                }
            }
            if (candidates.Count == 0)
            {
                // If there are no available backends, the request will be sent to the first one
                return 0;
            }
            if (candidates.Count == 1)
            {
                return candidates[0];
            }
            int totalWeight = 0;
            foreach (int weight in candidateWeights)
            {
                totalWeight += weight;
            }
            // Seed from the request id so that concurrent requests do not share a time-based seed
            Random random = new Random(context.RequestId.GetHashCode());
            if (totalWeight <= 0)
            {
                // No usable weights: fall back to a uniform pick among the candidates
                return candidates[random.Next(0, candidates.Count)];
            }
            // Weighted pick: draw a ticket in [0, totalWeight) and walk the cumulative weights
            int ticket = random.Next(0, totalWeight);
            for (int c = 0; c < candidates.Count; c++)
            {
                ticket -= candidateWeights[c];
                if (ticket < 0)
                {
                    return candidates[c];
                }
            }
            return candidates[candidates.Count - 1];
        }" />
        <set-backend-service backend-id="@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("name"))" />
        <forward-request buffer-request-body="true" />
        <choose>
            <!-- In case we got 429 or 5xx from a backend, update the list with its status -->
            <when condition="@(context.Response != null && (context.Response.StatusCode == 429 || context.Response.StatusCode >= 500) )">
                <!-- Re-read the shared list so updates stored by other requests are not overwritten with a stale copy -->
                <cache-lookup-value key="listBackends" variable-name="listBackends" />
                <set-variable name="listBackends" value="@{
                    JArray backends = (JArray)context.Variables["listBackends"];
                    int currentBackendIndex = context.Variables.GetValueOrDefault<int>("backendIndex");
                    // Take the delay from the first header that holds a non-negative whole number of seconds.
                    // Values that are not plain integers (for example an HTTP-date in Retry-After) are skipped
                    // instead of throwing, and 10 seconds is used when no header is usable.
                    int retryAfter;
                    bool hasDelay = Int32.TryParse(context.Response.Headers.GetValueOrDefault("Retry-After", ""), out retryAfter) && retryAfter >= 0;
                    if (!hasDelay)
                    {
                        hasDelay = Int32.TryParse(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-requests", ""), out retryAfter) && retryAfter >= 0;
                    }
                    if (!hasDelay)
                    {
                        hasDelay = Int32.TryParse(context.Response.Headers.GetValueOrDefault("x-ratelimit-reset-tokens", ""), out retryAfter) && retryAfter >= 0;
                    }
                    if (!hasDelay)
                    {
                        retryAfter = 10;
                    }
                    // The list was just re-read from the cache, so make sure the index still exists in it
                    if (currentBackendIndex >= 0 && currentBackendIndex < backends.Count)
                    {
                        JObject backend = (JObject)backends[currentBackendIndex];
                        backend["isThrottling"] = true;
                        backend["retryAfter"] = DateTime.UtcNow.AddSeconds(retryAfter);
                    }
                    return backends;
                }" />
                <cache-store-value key="listBackends" value="@((JArray)context.Variables["listBackends"])" duration="60" />
                <!-- Count the backends that are still usable; the retry condition stops once this reaches 0 -->
                <set-variable name="remainingBackends" value="@{
                    JArray backends = (JArray)context.Variables["listBackends"];
                    int available = 0;
                    for (int i = 0; i < backends.Count; i++)
                    {
                        if (!((JObject)backends[i]).Value<bool>("isThrottling"))
                        {
                            available++;
                        }
                    }
                    return available;
                }" />
            </when>
        </choose>
    </retry>
</backend>
<outbound>
    <base />
    <!-- Return the name of the backend that served the request in the x-ms-openai response header.
         Remove this block if you don't want to expose this data.
         The header is only added when a backend was actually selected for this request; without the guard
         the expression would throw if this section ran while backendIndex was still the initial "-1"
         string or listBackends was missing. -->
    <choose>
        <when condition="@{
            object backendsValue;
            object indexValue;
            if (!context.Variables.TryGetValue("listBackends", out backendsValue) || !context.Variables.TryGetValue("backendIndex", out indexValue))
            {
                return false;
            }
            JArray backends = backendsValue as JArray;
            if (backends == null || !(indexValue is int))
            {
                return false;
            }
            int index = (int)indexValue;
            return index >= 0 && index < backends.Count;
        }">
            <set-header name="x-ms-openai" exists-action="override">
                <value>@(((JObject)((JArray)context.Variables["listBackends"])[(Int32)context.Variables["backendIndex"]]).Value<string>("name"))</value>
            </set-header>
        </when>
    </choose>
</outbound>
<on-error>
<!-- No error handling is defined at this scope; only the policies inherited through base apply -->
<base />
</on-error>
</policies>