v2 / thirdparty / picohttpparser / src / picohttpparser.c
685 lines · 634 sloc · 26.72 KB · 09c35acb091f95183ca22c483f7f1ab657a321af
Raw
1/*
2 * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase,
3 * Shigeo Mitsunari
4 *
5 * The software is licensed under either the MIT License (below) or the Perl
6 * license.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to
10 * deal in the Software without restriction, including without limitation the
11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12 * sell copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * IN THE SOFTWARE.
25 */
26
27#include <assert.h>
28#include <stddef.h>
29#include <string.h>
30#ifdef __SSE4_2__
31#ifdef _MSC_VER
32#include <nmmintrin.h>
33#else
34#include <x86intrin.h>
35#endif
36#endif
37#include "picohttpparser.h"
38
39#if __GNUC__ >= 3
40#define likely(x) __builtin_expect(!!(x), 1)
41#define unlikely(x) __builtin_expect(!!(x), 0)
42#else
43#define likely(x) (x)
44#define unlikely(x) (x)
45#endif
46
47#ifdef _MSC_VER
48#define ALIGNED(n) _declspec(align(n))
49#else
50#define ALIGNED(n) __attribute__((aligned(n)))
51#endif
52
53#define IS_PRINTABLE_ASCII(c) ((unsigned char)(c)-040u < 0137u)
54
55#define CHECK_EOF() \
56 if (buf == buf_end) { \
57 *ret = -2; \
58 return NULL; \
59 }
60
61#define EXPECT_CHAR_NO_CHECK(ch) \
62 if (*buf++ != ch) { \
63 *ret = -1; \
64 return NULL; \
65 }
66
67#define EXPECT_CHAR(ch) \
68 CHECK_EOF(); \
69 EXPECT_CHAR_NO_CHECK(ch);
70
71#define ADVANCE_TOKEN(tok, toklen) \
72 do { \
73 const char *tok_start = buf; \
74 static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
75 int found2; \
76 buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
77 if (!found2) { \
78 CHECK_EOF(); \
79 } \
80 while (1) { \
81 if (*buf == ' ') { \
82 break; \
83 } else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { \
84 if ((unsigned char)*buf < '\040' || *buf == '\177') { \
85 *ret = -1; \
86 return NULL; \
87 } \
88 } \
89 ++buf; \
90 CHECK_EOF(); \
91 } \
92 tok = tok_start; \
93 toklen = buf - tok_start; \
94 } while (0)
95
96static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
97 "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0"
98 "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1"
99 "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0"
100 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
101 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
102 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
103 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
104
105static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found)
106{
107 *found = 0;
108#if __SSE4_2__
109 if (likely(buf_end - buf >= 16)) {
110 __m128i ranges16 = _mm_loadu_si128((const __m128i *)ranges);
111
112 size_t left = (buf_end - buf) & ~15;
113 do {
114 __m128i b16 = _mm_loadu_si128((const __m128i *)buf);
115 int r = _mm_cmpestri(ranges16, ranges_size, b16, 16, _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
116 if (unlikely(r != 16)) {
117 buf += r;
118 *found = 1;
119 break;
120 }
121 buf += 16;
122 left -= 16;
123 } while (likely(left != 0));
124 }
125#else
126 /* suppress unused parameter warning */
127 (void)buf_end;
128 (void)ranges;
129 (void)ranges_size;
130#endif
131 return buf;
132}
133
134static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
135{
136 const char *token_start = buf;
137
138#ifdef __SSE4_2__
139 static const char ALIGNED(16) ranges1[16] = "\0\010" /* allow HT */
140 "\012\037" /* allow SP and up to but not including DEL */
141 "\177\177"; /* allow chars w. MSB set */
142 int found;
143 buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
144 if (found)
145 goto FOUND_CTL;
146#else
147 /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148 while (likely(buf_end - buf >= 8)) {
149#define DOIT() \
150 do { \
151 if (unlikely(!IS_PRINTABLE_ASCII(*buf))) \
152 goto NonPrintable; \
153 ++buf; \
154 } while (0)
155 DOIT();
156 DOIT();
157 DOIT();
158 DOIT();
159 DOIT();
160 DOIT();
161 DOIT();
162 DOIT();
163#undef DOIT
164 continue;
165 NonPrintable:
166 if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) {
167 goto FOUND_CTL;
168 }
169 ++buf;
170 }
171#endif
172 for (;; ++buf) {
173 CHECK_EOF();
174 if (unlikely(!IS_PRINTABLE_ASCII(*buf))) {
175 if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) {
176 goto FOUND_CTL;
177 }
178 }
179 }
180FOUND_CTL:
181 if (likely(*buf == '\015')) {
182 ++buf;
183 EXPECT_CHAR('\012');
184 *token_len = buf - 2 - token_start;
185 } else if (*buf == '\012') {
186 *token_len = buf - token_start;
187 ++buf;
188 } else {
189 *ret = -1;
190 return NULL;
191 }
192 *token = token_start;
193
194 return buf;
195}
196
197static const char *is_complete(const char *buf, const char *buf_end, size_t last_len, int *ret)
198{
199 int ret_cnt = 0;
200 buf = last_len < 3 ? buf : buf + last_len - 3;
201
202 while (1) {
203 CHECK_EOF();
204 if (*buf == '\015') {
205 ++buf;
206 CHECK_EOF();
207 EXPECT_CHAR('\012');
208 ++ret_cnt;
209 } else if (*buf == '\012') {
210 ++buf;
211 ++ret_cnt;
212 } else {
213 ++buf;
214 ret_cnt = 0;
215 }
216 if (ret_cnt == 2) {
217 return buf;
218 }
219 }
220
221 *ret = -2;
222 return NULL;
223}
224
225#define PARSE_INT(valp_, mul_) \
226 if (*buf < '0' || '9' < *buf) { \
227 buf++; \
228 *ret = -1; \
229 return NULL; \
230 } \
231 *(valp_) = (mul_) * (*buf++ - '0');
232
233#define PARSE_INT_3(valp_) \
234 do { \
235 int res_ = 0; \
236 PARSE_INT(&res_, 100) \
237 *valp_ = res_; \
238 PARSE_INT(&res_, 10) \
239 *valp_ += res_; \
240 PARSE_INT(&res_, 1) \
241 *valp_ += res_; \
242 } while (0)
243
244/* returned pointer is always within [buf, buf_end), or null */
245static const char *parse_token(const char *buf, const char *buf_end, const char **token, size_t *token_len, char next_char,
246 int *ret)
247{
248 /* We use pcmpestri to detect non-token characters. This instruction can take no more than eight character ranges (8*2*8=128
249 * bits that is the size of a SSE register). Due to this restriction, characters `|` and `~` are handled in the slow loop. */
250 static const char ALIGNED(16) ranges[] = "\x00 " /* control chars and up to SP */
251 "\"\"" /* 0x22 */
252 "()" /* 0x28,0x29 */
253 ",," /* 0x2c */
254 "//" /* 0x2f */
255 ":@" /* 0x3a-0x40 */
256 "[]" /* 0x5b-0x5d */
257 "{\xff"; /* 0x7b-0xff */
258 const char *buf_start = buf;
259 int found;
260 buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found);
261 if (!found) {
262 CHECK_EOF();
263 }
264 while (1) {
265 if (*buf == next_char) {
266 break;
267 } else if (!token_char_map[(unsigned char)*buf]) {
268 *ret = -1;
269 return NULL;
270 }
271 ++buf;
272 CHECK_EOF();
273 }
274 *token = buf_start;
275 *token_len = buf - buf_start;
276 return buf;
277}
278
279/* returned pointer is always within [buf, buf_end), or null */
280static const char *parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret)
281{
282 /* we want at least [HTTP/1.<two chars>] to try to parse */
283 if (buf_end - buf < 9) {
284 *ret = -2;
285 return NULL;
286 }
287 EXPECT_CHAR_NO_CHECK('H');
288 EXPECT_CHAR_NO_CHECK('T');
289 EXPECT_CHAR_NO_CHECK('T');
290 EXPECT_CHAR_NO_CHECK('P');
291 EXPECT_CHAR_NO_CHECK('/');
292 EXPECT_CHAR_NO_CHECK('1');
293 EXPECT_CHAR_NO_CHECK('.');
294 PARSE_INT(minor_version, 1);
295 return buf;
296}
297
298static const char *parse_headers(const char *buf, const char *buf_end, struct phr_header *headers, size_t *num_headers,
299 size_t max_headers, int *ret)
300{
301 for (;; ++*num_headers) {
302 CHECK_EOF();
303 if (*buf == '\015') {
304 ++buf;
305 EXPECT_CHAR('\012');
306 break;
307 } else if (*buf == '\012') {
308 ++buf;
309 break;
310 }
311 if (*num_headers == max_headers) {
312 *ret = -1;
313 return NULL;
314 }
315 if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
316 /* parsing name, but do not discard SP before colon, see
317 * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
318 if ((buf = parse_token(buf, buf_end, &headers[*num_headers].name, &headers[*num_headers].name_len, ':', ret)) == NULL) {
319 return NULL;
320 }
321 if (headers[*num_headers].name_len == 0) {
322 *ret = -1;
323 return NULL;
324 }
325 ++buf;
326 for (;; ++buf) {
327 CHECK_EOF();
328 if (!(*buf == ' ' || *buf == '\t')) {
329 break;
330 }
331 }
332 } else {
333 headers[*num_headers].name = NULL;
334 headers[*num_headers].name_len = 0;
335 }
336 const char *value;
337 size_t value_len;
338 if ((buf = get_token_to_eol(buf, buf_end, &value, &value_len, ret)) == NULL) {
339 return NULL;
340 }
341 /* remove trailing SPs and HTABs */
342 const char *value_end = value + value_len;
343 for (; value_end != value; --value_end) {
344 const char c = *(value_end - 1);
345 if (!(c == ' ' || c == '\t')) {
346 break;
347 }
348 }
349 headers[*num_headers].value = value;
350 headers[*num_headers].value_len = value_end - value;
351 }
352 return buf;
353}
354
355static const char *parse_request(const char *buf, const char *buf_end, const char **method, size_t *method_len, const char **path,
356 size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers,
357 size_t max_headers, int *ret)
358{
359 /* skip first empty line (some clients add CRLF after POST content) */
360 CHECK_EOF();
361 if (*buf == '\015') {
362 ++buf;
363 EXPECT_CHAR('\012');
364 } else if (*buf == '\012') {
365 ++buf;
366 }
367
368 /* parse request line */
369 if ((buf = parse_token(buf, buf_end, method, method_len, ' ', ret)) == NULL) {
370 return NULL;
371 }
372 do {
373 ++buf;
374 CHECK_EOF();
375 } while (*buf == ' ');
376 ADVANCE_TOKEN(*path, *path_len);
377 do {
378 ++buf;
379 CHECK_EOF();
380 } while (*buf == ' ');
381 if (*method_len == 0 || *path_len == 0) {
382 *ret = -1;
383 return NULL;
384 }
385 if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
386 return NULL;
387 }
388 if (*buf == '\015') {
389 ++buf;
390 EXPECT_CHAR('\012');
391 } else if (*buf == '\012') {
392 ++buf;
393 } else {
394 *ret = -1;
395 return NULL;
396 }
397
398 return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret);
399}
400
401int phr_parse_request(const char *buf_start, size_t len, const char **method, size_t *method_len, const char **path,
402 size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len)
403{
404 const char *buf = buf_start, *buf_end = buf_start + len;
405 size_t max_headers = *num_headers;
406 int r;
407
408 *method = NULL;
409 *method_len = 0;
410 *path = NULL;
411 *path_len = 0;
412 *minor_version = -1;
413 *num_headers = 0;
414
415 /* if last_len != 0, check if the request is complete (a fast countermeasure
416 againt slowloris */
417 if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) {
418 return r;
419 }
420
421 if ((buf = parse_request(buf, buf_end, method, method_len, path, path_len, minor_version, headers, num_headers, max_headers,
422 &r)) == NULL) {
423 return r;
424 }
425
426 return (int)(buf - buf_start);
427}
428
429static const char *parse_response(const char *buf, const char *buf_end, int *minor_version, int *status, const char **msg,
430 size_t *msg_len, struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret)
431{
432 /* parse "HTTP/1.x" */
433 if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
434 return NULL;
435 }
436 /* skip space */
437 if (*buf != ' ') {
438 *ret = -1;
439 return NULL;
440 }
441 do {
442 ++buf;
443 CHECK_EOF();
444 } while (*buf == ' ');
445 /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
446 if (buf_end - buf < 4) {
447 *ret = -2;
448 return NULL;
449 }
450 PARSE_INT_3(status);
451
452 /* get message including preceding space */
453 if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) {
454 return NULL;
455 }
456 if (*msg_len == 0) {
457 /* ok */
458 } else if (**msg == ' ') {
459 /* Remove preceding space. Successful return from `get_token_to_eol` guarantees that we would hit something other than SP
460 * before running past the end of the given buffer. */
461 do {
462 ++*msg;
463 --*msg_len;
464 } while (**msg == ' ');
465 } else {
466 /* garbage found after status code */
467 *ret = -1;
468 return NULL;
469 }
470
471 return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret);
472}
473
474int phr_parse_response(const char *buf_start, size_t len, int *minor_version, int *status, const char **msg, size_t *msg_len,
475 struct phr_header *headers, size_t *num_headers, size_t last_len)
476{
477 const char *buf = buf_start, *buf_end = buf + len;
478 size_t max_headers = *num_headers;
479 int r;
480
481 *minor_version = -1;
482 *status = 0;
483 *msg = NULL;
484 *msg_len = 0;
485 *num_headers = 0;
486
487 /* if last_len != 0, check if the response is complete (a fast countermeasure
488 against slowloris */
489 if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) {
490 return r;
491 }
492
493 if ((buf = parse_response(buf, buf_end, minor_version, status, msg, msg_len, headers, num_headers, max_headers, &r)) == NULL) {
494 return r;
495 }
496
497 return (int)(buf - buf_start);
498}
499
500int phr_parse_headers(const char *buf_start, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len)
501{
502 const char *buf = buf_start, *buf_end = buf + len;
503 size_t max_headers = *num_headers;
504 int r;
505
506 *num_headers = 0;
507
508 /* if last_len != 0, check if the response is complete (a fast countermeasure
509 against slowloris */
510 if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) {
511 return r;
512 }
513
514 if ((buf = parse_headers(buf, buf_end, headers, num_headers, max_headers, &r)) == NULL) {
515 return r;
516 }
517
518 return (int)(buf - buf_start);
519}
520
521enum {
522 CHUNKED_IN_CHUNK_SIZE,
523 CHUNKED_IN_CHUNK_EXT,
524 CHUNKED_IN_CHUNK_DATA,
525 CHUNKED_IN_CHUNK_CRLF,
526 CHUNKED_IN_TRAILERS_LINE_HEAD,
527 CHUNKED_IN_TRAILERS_LINE_MIDDLE
528};
529
530static int decode_hex(int ch)
531{
532 if ('0' <= ch && ch <= '9') {
533 return ch - '0';
534 } else if ('A' <= ch && ch <= 'F') {
535 return ch - 'A' + 0xa;
536 } else if ('a' <= ch && ch <= 'f') {
537 return ch - 'a' + 0xa;
538 } else {
539 return -1;
540 }
541}
542
543ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *_bufsz)
544{
545 size_t dst = 0, src = 0, bufsz = *_bufsz;
546 ssize_t ret = -2; /* incomplete */
547
548 decoder->_total_read += bufsz;
549
550 while (1) {
551 switch (decoder->_state) {
552 case CHUNKED_IN_CHUNK_SIZE:
553 for (;; ++src) {
554 int v;
555 if (src == bufsz)
556 goto Exit;
557 if ((v = decode_hex(buf[src])) == -1) {
558 if (decoder->_hex_count == 0) {
559 ret = -1;
560 goto Exit;
561 }
562 /* the only characters that may appear after the chunk size are BWS, semicolon, or CRLF */
563 switch (buf[src]) {
564 case ' ':
565 case '\011':
566 case ';':
567 case '\012':
568 case '\015':
569 break;
570 default:
571 ret = -1;
572 goto Exit;
573 }
574 break;
575 }
576 if (decoder->_hex_count == sizeof(size_t) * 2) {
577 ret = -1;
578 goto Exit;
579 }
580 decoder->bytes_left_in_chunk = decoder->bytes_left_in_chunk * 16 + v;
581 ++decoder->_hex_count;
582 }
583 decoder->_hex_count = 0;
584 decoder->_state = CHUNKED_IN_CHUNK_EXT;
585 /* fallthru */
586 case CHUNKED_IN_CHUNK_EXT:
587 /* RFC 7230 A.2 "Line folding in chunk extensions is disallowed" */
588 for (;; ++src) {
589 if (src == bufsz)
590 goto Exit;
591 if (buf[src] == '\012')
592 break;
593 }
594 ++src;
595 if (decoder->bytes_left_in_chunk == 0) {
596 if (decoder->consume_trailer) {
597 decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
598 break;
599 } else {
600 goto Complete;
601 }
602 }
603 decoder->_state = CHUNKED_IN_CHUNK_DATA;
604 /* fallthru */
605 case CHUNKED_IN_CHUNK_DATA: {
606 size_t avail = bufsz - src;
607 if (avail < decoder->bytes_left_in_chunk) {
608 if (dst != src)
609 memmove(buf + dst, buf + src, avail);
610 src += avail;
611 dst += avail;
612 decoder->bytes_left_in_chunk -= avail;
613 goto Exit;
614 }
615 if (dst != src)
616 memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk);
617 src += decoder->bytes_left_in_chunk;
618 dst += decoder->bytes_left_in_chunk;
619 decoder->bytes_left_in_chunk = 0;
620 decoder->_state = CHUNKED_IN_CHUNK_CRLF;
621 }
622 /* fallthru */
623 case CHUNKED_IN_CHUNK_CRLF:
624 for (;; ++src) {
625 if (src == bufsz)
626 goto Exit;
627 if (buf[src] != '\015')
628 break;
629 }
630 if (buf[src] != '\012') {
631 ret = -1;
632 goto Exit;
633 }
634 ++src;
635 decoder->_state = CHUNKED_IN_CHUNK_SIZE;
636 break;
637 case CHUNKED_IN_TRAILERS_LINE_HEAD:
638 for (;; ++src) {
639 if (src == bufsz)
640 goto Exit;
641 if (buf[src] != '\015')
642 break;
643 }
644 if (buf[src++] == '\012')
645 goto Complete;
646 decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE;
647 /* fallthru */
648 case CHUNKED_IN_TRAILERS_LINE_MIDDLE:
649 for (;; ++src) {
650 if (src == bufsz)
651 goto Exit;
652 if (buf[src] == '\012')
653 break;
654 }
655 ++src;
656 decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
657 break;
658 default:
659 assert(!"decoder is corrupt");
660 }
661 }
662
663Complete:
664 ret = bufsz - src;
665Exit:
666 if (dst != src)
667 memmove(buf + dst, buf + src, bufsz - src);
668 *_bufsz = dst;
669 /* if incomplete but the overhead of the chunked encoding is >=100KB and >80%, signal an error */
670 if (ret == -2) {
671 decoder->_total_overhead += bufsz - dst;
672 if (decoder->_total_overhead >= 100 * 1024 && decoder->_total_read - decoder->_total_overhead < decoder->_total_read / 4)
673 ret = -1;
674 }
675 return ret;
676}
677
678int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder)
679{
680 return decoder->_state == CHUNKED_IN_CHUNK_DATA;
681}
682
683#undef CHECK_EOF
684#undef EXPECT_CHAR
685#undef ADVANCE_TOKEN
686