ggdgsdbsdbbb / markdown / thirdparty / md4c / md4c.c
6413 lines · 5434 sloc · 218.82 KB · 8dec386d5f65c9eed2c7c24c74374dee9d07525f
Raw
1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* "inline" only became a keyword with C99 (__STDC_VERSION__ 199901L).
     * C89/90 and the C95 amendment (199409L) do not understand it, so map
     * it to the compiler-specific spelling, or drop it altogether. */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* UTF-8 is the default encoding unless the build selected one explicitly. */
#if !(defined MD4C_USE_ASCII || defined MD4C_USE_UTF8 || defined MD4C_USE_UTF16)
    #define MD4C_USE_UTF8
#endif

/* _T() makes a wide literal in the MD4C_USE_UTF16 build and leaves the
 * literal untouched otherwise. Any earlier definition of _T is dropped. */
#undef _T
#if !defined MD4C_USE_UTF16
    #define _T(x)           x
#else
    #define _T(x)           L##x
#endif
63
/* Misc. macros. */

/* Count of elements of an array (must be a real array, not a pointer). */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification: STRINGIZE() expands macros in its argument
 * before turning it into a string literal, STRINGIZE_() does not. */
#define STRINGIZE(x)        STRINGIZE_(x)
#define STRINGIZE_(x)       #x

#if !defined TRUE
    #define FALSE           0
    #define TRUE            1
#endif
74
/* Pass a diagnostic message to the debug_log callback of the parser, if the
 * callback is set. Uses the variable "ctx" of the enclosing scope. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log == NULL)                               \
            break;                                                      \
        ctx->parser.debug_log((msg), ctx->userdata);                    \
    } while(0)
80
/* MD_ASSERT() and MD_UNREACHABLE():
 *  -- With DEBUG defined, a failed assertion is logged through MD_LOG() and
 *     the process exits with status 1.
 *  -- Otherwise the condition is only passed to the compiler as a hint
 *     (__builtin_unreachable() or __assume()), or ignored completely when
 *     neither of those is available.
 */
#if defined DEBUG
    #define MD_ASSERT(cond)                                             \
            do {                                                        \
                if(!(cond)) {                                           \
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
                    exit(1);                                            \
                }                                                       \
            } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#elif defined __GNUC__ && !defined __TINYC__
    #define MD_ASSERT(cond)         do { if(!(cond)) __builtin_unreachable(); } while(0)
    #define MD_UNREACHABLE()        do { __builtin_unreachable(); } while(0)
#elif defined _MSC_VER && _MSC_VER > 120
    /* NOTE(review): the threshold 120 is kept as found -- confirm whether
     * a real _MSC_VER value (e.g. 1200) was intended. */
    #define MD_ASSERT(cond)         do { __assume(cond); } while(0)
    #define MD_UNREACHABLE()        do { __assume(0); } while(0)
#else
    #define MD_ASSERT(cond)         do {} while(0)
    #define MD_UNREACHABLE()        do {} while(0)
#endif
104
/* Marks an intentional fall-through between case labels of a switch.
 * Compilers known to support the attribute (clang 12+, GCC 7+) get it;
 * for all others the macro expands to a harmless no-op expression. */
#if (defined __clang__ && __clang_major__ >= 12) || (defined __GNUC__ && __GNUC__ >= 7)
    #define MD_FALLTHROUGH()    __attribute__((fallthrough))
#else
    #define MD_FALLTHROUGH()    ((void)0)
#endif
113
/* Silences "unused parameter" warnings by evaluating the argument as void. */
#define MD_UNUSED(x)        ((void)(x))
116
117
118/************************
119 *** Internal Types ***
120 ************************/
121
/* Short aliases for types which are used all over this file. */
#define OFF     MD_OFFSET
#define SZ      MD_SIZE
#define CHAR    MD_CHAR

/* Forward declarations, so that pointers to these structures can be used
 * before the structures themselves are defined. */
typedef struct MD_REF_DEF_tag MD_REF_DEF;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_MARK_tag MD_MARK;
131
132
/* During analysis of inline marks, we need to manage some "mark chains"
 * of (yet unresolved) openers. This structure only remembers where such
 * a chain starts and where it ends; the links between the chain members
 * are realized through MD_MARK::prev and ::next.
 */
typedef struct MD_MARKCHAIN_tag {
    int head;   /* Index of first mark in the chain, or -1 if empty. */
    int tail;   /* Index of last mark in the chain, or -1 if empty. */
} MD_MARKCHAIN;
142
143/* Context propagated through all the parsing. */
144typedef struct MD_CTX_tag MD_CTX;
145struct MD_CTX_tag {
146 /* Immutable stuff (parameters of md_parse()). */
147 const CHAR* text;
148 SZ size;
149 MD_PARSER parser;
150 void* userdata;
151
152 /* When this is true, it allows some optimizations. */
153 int doc_ends_with_newline;
154
155 /* Helper temporary growing buffer. */
156 CHAR* buffer;
157 unsigned alloc_buffer;
158
159 /* Reference definitions. */
160 MD_REF_DEF* ref_defs;
161 int n_ref_defs;
162 int alloc_ref_defs;
163 void** ref_def_hashtable;
164 int ref_def_hashtable_size;
165
166 /* Stack of inline/span markers.
167 * This is only used for parsing a single block contents but by storing it
168 * here we may reuse the stack for subsequent blocks; i.e. we have fewer
169 * (re)allocations. */
170 MD_MARK* marks;
171 int n_marks;
172 int alloc_marks;
173
174#if defined MD4C_USE_UTF16
175 char mark_char_map[128];
176#else
177 char mark_char_map[256];
178#endif
179
180 /* For resolving of inline spans. */
181 MD_MARKCHAIN mark_chains[13];
182#define PTR_CHAIN (ctx->mark_chains[0])
183#define TABLECELLBOUNDARIES (ctx->mark_chains[1])
184#define ASTERISK_OPENERS_extraword_mod3_0 (ctx->mark_chains[2])
185#define ASTERISK_OPENERS_extraword_mod3_1 (ctx->mark_chains[3])
186#define ASTERISK_OPENERS_extraword_mod3_2 (ctx->mark_chains[4])
187#define ASTERISK_OPENERS_intraword_mod3_0 (ctx->mark_chains[5])
188#define ASTERISK_OPENERS_intraword_mod3_1 (ctx->mark_chains[6])
189#define ASTERISK_OPENERS_intraword_mod3_2 (ctx->mark_chains[7])
190#define UNDERSCORE_OPENERS (ctx->mark_chains[8])
191#define TILDE_OPENERS_1 (ctx->mark_chains[9])
192#define TILDE_OPENERS_2 (ctx->mark_chains[10])
193#define BRACKET_OPENERS (ctx->mark_chains[11])
194#define DOLLAR_OPENERS (ctx->mark_chains[12])
195#define OPENERS_CHAIN_FIRST 1
196#define OPENERS_CHAIN_LAST 12
197
198 int n_table_cell_boundaries;
199
200 /* For resolving links. */
201 int unresolved_link_head;
202 int unresolved_link_tail;
203
204 /* For resolving raw HTML. */
205 OFF html_comment_horizon;
206 OFF html_proc_instr_horizon;
207 OFF html_decl_horizon;
208 OFF html_cdata_horizon;
209
210 /* For block analysis.
211 * Notes:
212 * -- It holds MD_BLOCK as well as MD_LINE structures. After each
213 * MD_BLOCK, its (multiple) MD_LINE(s) follow.
214 * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
215 * instead of MD_LINE(s).
216 */
217 void* block_bytes;
218 MD_BLOCK* current_block;
219 int n_block_bytes;
220 int alloc_block_bytes;
221
222 /* For container block analysis. */
223 MD_CONTAINER* containers;
224 int n_containers;
225 int alloc_containers;
226
227 /* Minimal indentation to call the block "indented code block". */
228 unsigned code_indent_offset;
229
230 /* Contextual info for line analysis. */
231 SZ code_fence_length; /* For checking closing fence length. */
232 int html_block_type; /* For checking closing raw HTML condition. */
233 int last_line_has_list_loosening_effect;
234 int last_list_item_starts_with_two_blank_lines;
235};
236
/* Kinds of lines recognized by the line analysis. */
typedef enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
} MD_LINETYPE;
251
252typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
253struct MD_LINE_ANALYSIS_tag {
254 MD_LINETYPE type : 16;
255 unsigned data : 16;
256 OFF beg;
257 OFF end;
258 unsigned indent; /* Indentation level. */
259};
260
261typedef struct MD_LINE_tag MD_LINE;
262struct MD_LINE_tag {
263 OFF beg;
264 OFF end;
265};
266
267typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
268struct MD_VERBATIMLINE_tag {
269 OFF beg;
270 OFF end;
271 OFF indent;
272};
273
274
275/*****************
276 *** Helpers ***
277 *****************/
278
/* Access to the input text by offset: CH() gives the character at the
 * offset, STR() a pointer to it. Both use "ctx" of the enclosing scope. */
#define CH(off)                 (*(ctx->text + (off)))
#define STR(off)                (&ctx->text[(off)])
282
/* Classification of a character value.
 * Note we assume ASCII compatibility of code points < 128 here. */

/* Generic building blocks. */
#define ISIN_(ch, ch_min, ch_max)       ((unsigned)(ch) >= (ch_min) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           ((ch) != _T('\0') && md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch1) == (ch) || (ch2) == (ch))
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch1) == (ch) || (ch2) == (ch) || (ch3) == (ch))

/* Character classes. */
#define ISASCII_(ch)                    ((unsigned)(ch) < 128)
#define ISCNTRL_(ch)                    ((unsigned)(ch) < 32 || (unsigned)(ch) == 127)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
301
/* The same classification applied to the character of the input text at
 * the given offset (read through CH()). */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))

#define ISALNUM(off)                    ISALNUM_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
317
318
/* Character search matching the width of the characters in use. */
#ifdef MD4C_USE_UTF16
    #define md_strchr   wcschr
#else
    #define md_strchr   strchr
#endif
324
325
326/* Case insensitive check of string equality. */
327static inline int
328md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
329{
330 OFF i;
331 for(i = 0; i < n; i++) {
332 CHAR ch1 = s1[i];
333 CHAR ch2 = s2[i];
334
335 if(ISLOWER_(ch1))
336 ch1 += ('A'-'a');
337 if(ISLOWER_(ch2))
338 ch2 += ('A'-'a');
339 if(ch1 != ch2)
340 return FALSE;
341 }
342 return TRUE;
343}
344
345static inline int
346md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
347{
348 return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
349}
350
351static int
352md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
353{
354 OFF off = 0;
355 int ret = 0;
356
357 while(1) {
358 while(off < size && str[off] != _T('\0'))
359 off++;
360
361 if(off > 0) {
362 ret = ctx->parser.text(type, str, off, ctx->userdata);
363 if(ret != 0)
364 return ret;
365
366 str += off;
367 size -= off;
368 off = 0;
369 }
370
371 if(off >= size)
372 return 0;
373
374 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
375 if(ret != 0)
376 return ret;
377 off++;
378 }
379}
380
381
/* Evaluates the expression, stores its value in "ret" and jumps to the
 * label "abort" of the enclosing function if the value is negative. */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        if((ret = (func)) < 0)                                              \
            goto abort;                                                     \
    } while(0)
388
389
/* Makes sure the temporary buffer ctx->buffer can hold at least sz bytes
 * (the size is passed to realloc() as is).
 *
 * The buffer is only ever enlarged; the new size is sz plus one half of
 * it plus 128, rounded down to a multiple of 128. If realloc() fails, the
 * failure is logged, "ret" is set to -1 and the macro jumps to the label
 * "abort" of the enclosing function; the old buffer stays untouched.
 *
 * Note the argument is evaluated more than once.
 */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
407
408
/* Shared implementation of the callback wrappers below: performs the call,
 * stores its result in "ret" and, if the result is non-zero, logs the
 * message and jumps to the label "abort" of the enclosing function. */
#define MD_CALLBACK_OR_ABORT_(call, abort_msg)                              \
    do {                                                                    \
        ret = (call);                                                       \
        if(ret != 0) {                                                      \
            MD_LOG(abort_msg);                                              \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

/* Wrappers calling the block and span callbacks of the parser. */
#define MD_ENTER_BLOCK(type, arg)                                           \
    MD_CALLBACK_OR_ABORT_(                                                  \
        ctx->parser.enter_block((type), (arg), ctx->userdata),              \
        "Aborted from enter_block() callback.")

#define MD_LEAVE_BLOCK(type, arg)                                           \
    MD_CALLBACK_OR_ABORT_(                                                  \
        ctx->parser.leave_block((type), (arg), ctx->userdata),              \
        "Aborted from leave_block() callback.")

#define MD_ENTER_SPAN(type, arg)                                            \
    MD_CALLBACK_OR_ABORT_(                                                  \
        ctx->parser.enter_span((type), (arg), ctx->userdata),               \
        "Aborted from enter_span() callback.")

#define MD_LEAVE_SPAN(type, arg)                                            \
    MD_CALLBACK_OR_ABORT_(                                                  \
        ctx->parser.leave_span((type), (arg), ctx->userdata),               \
        "Aborted from leave_span() callback.")
444
/* Reports a piece of text through the text() callback of the parser.
 * Nothing is reported when the size is zero. If the callback returns
 * non-zero, the value is left in "ret", the abort is logged and the macro
 * jumps to the label "abort" of the enclosing function.
 *
 * Note the size argument is evaluated twice. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)

/* The same as MD_TEXT(), but the text goes through
 * md_text_with_null_replacement() instead of directly to the callback. */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
466
467
468/* If the offset falls into a gap between line, we return the following
469 * line. */
470static const MD_LINE*
471md_lookup_line(OFF off, const MD_LINE* lines, int n_lines)
472{
473 int lo, hi;
474 int pivot;
475 const MD_LINE* line;
476
477 lo = 0;
478 hi = n_lines - 1;
479 while(lo <= hi) {
480 pivot = (lo + hi) / 2;
481 line = &lines[pivot];
482
483 if(off < line->beg) {
484 hi = pivot - 1;
485 if(hi < 0 || lines[hi].end <= off)
486 return line;
487 } else if(off > line->end) {
488 lo = pivot + 1;
489 } else {
490 return line;
491 }
492 }
493
494 return NULL;
495}
496
497
498/*************************
499 *** Unicode Support ***
500 *************************/
501
/* Case folding of one codepoint: up to three resulting codepoints. */
typedef struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];     /* The codepoints of the folding. */
    unsigned n_codepoints;      /* How many items of codepoints[] are used. */
} MD_UNICODE_FOLD_INFO;
507
508
509#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
510 /* Binary search over sorted "map" of codepoints. Consecutive sequences
511 * of codepoints may be encoded in the map by just using the
512 * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000).
513 *
514 * Returns index of the found record in the map (in the case of ranges,
515 * the minimal value is used); or -1 on failure. */
516 static int
517 md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
518 {
519 int beg, end;
520 int pivot_beg, pivot_end;
521
522 beg = 0;
523 end = (int) map_size-1;
524 while(beg <= end) {
525 /* Pivot may be a range, not just a single value. */
526 pivot_beg = pivot_end = (beg + end) / 2;
527 if(map[pivot_end] & 0x40000000)
528 pivot_end++;
529 if(map[pivot_beg] & 0x80000000)
530 pivot_beg--;
531
532 if(codepoint < (map[pivot_beg] & 0x00ffffff))
533 end = pivot_beg - 1;
534 else if(codepoint > (map[pivot_end] & 0x00ffffff))
535 beg = pivot_end + 1;
536 else
537 return pivot_beg;
538 }
539
540 return -1;
541 }
542
543 static int
544 md_is_unicode_whitespace__(unsigned codepoint)
545 {
546#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
547#define S(cp) (cp)
548 /* Unicode "Zs" category.
549 * (generated by scripts/build_whitespace_map.py) */
550 static const unsigned WHITESPACE_MAP[] = {
551 S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
552 };
553#undef R
554#undef S
555
556 /* The ASCII ones are the most frequently used ones, also CommonMark
557 * specification requests few more in this range. */
558 if(codepoint <= 0x7f)
559 return ISWHITESPACE_(codepoint);
560
561 return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
562 }
563
564 static int
565 md_is_unicode_punct__(unsigned codepoint)
566 {
567#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
568#define S(cp) (cp)
569 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
570 * (generated by scripts/build_punct_map.py) */
571 static const unsigned PUNCT_MAP[] = {
572 R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
573 R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
574 S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
575 S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
576 R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
577 R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
578 R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
579 R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
580 R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
581 R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
582 R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
583 R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
584 R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
585 R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
586 R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
587 S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
588 R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
589 S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
590 S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
591 R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
592 R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
593 S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
594 R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
595 R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
596 R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
597 R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
598 R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
599 R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
600 S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
601 R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
602 };
603#undef R
604#undef S
605
606 /* The ASCII ones are the most frequently used ones, also CommonMark
607 * specification requests few more in this range. */
608 if(codepoint <= 0x7f)
609 return ISPUNCT_(codepoint);
610
611 return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
612 }
613
614 static void
615 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
616 {
617#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
618#define S(cp) (cp)
619 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
620 * (generated by scripts/build_folding_map.py) */
621 static const unsigned FOLD_MAP_1[] = {
622 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
623 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
624 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
625 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
626 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
627 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
628 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
629 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
630 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
631 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
632 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
633 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
634 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
635 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
636 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
637 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
638 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
639 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
640 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
641 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
642 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
643 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
644 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
645 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
646 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
647 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
648 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
649 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
650 };
651 static const unsigned FOLD_MAP_1_DATA[] = {
652 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
653 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
654 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
655 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
656 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
657 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
658 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
659 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
660 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
661 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
662 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
663 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
664 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
665 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
666 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
667 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
668 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
669 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
670 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
671 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
672 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
673 0x1e943
674 };
675 static const unsigned FOLD_MAP_2[] = {
676 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
677 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
678 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
679 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
680 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
681 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
682 };
683 static const unsigned FOLD_MAP_2_DATA[] = {
684 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
685 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
686 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
687 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
688 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
689 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
690 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
691 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
692 };
693 static const unsigned FOLD_MAP_3[] = {
694 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
695 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
696 };
697 static const unsigned FOLD_MAP_3_DATA[] = {
698 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
699 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
700 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
701 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
702 };
703#undef R
704#undef S
705 static const struct {
706 const unsigned* map;
707 const unsigned* data;
708 size_t map_size;
709 unsigned n_codepoints;
710 } FOLD_MAP_LIST[] = {
711 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
712 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
713 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
714 };
715
716 int i;
717
718 /* Fast path for ASCII characters. */
719 if(codepoint <= 0x7f) {
720 info->codepoints[0] = codepoint;
721 if(ISUPPER_(codepoint))
722 info->codepoints[0] += 'a' - 'A';
723 info->n_codepoints = 1;
724 return;
725 }
726
727 /* Try to locate the codepoint in any of the maps. */
728 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
729 int index;
730
731 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
732 if(index >= 0) {
733 /* Found the mapping. */
734 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
735 const unsigned* map = FOLD_MAP_LIST[i].map;
736 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
737
738 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
739 info->n_codepoints = n_codepoints;
740
741 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
742 /* The found mapping maps whole range of codepoints,
743 * i.e. we have to offset info->codepoints[0] accordingly. */
744 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
745 /* Alternating type of the range. */
746 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
747 } else {
748 /* Range to range kind of mapping. */
749 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
750 }
751 }
752
753 return;
754 }
755 }
756
757 /* No mapping found. Map the codepoint to itself. */
758 info->codepoints[0] = codepoint;
759 info->n_codepoints = 1;
760 }
761#endif
762
763
764#if defined MD4C_USE_UTF16
765 #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800)
766 #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00)
767 #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
768
769 static unsigned
770 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
771 {
772 if(IS_UTF16_SURROGATE_HI(str[0])) {
773 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
774 if(p_size != NULL)
775 *p_size = 2;
776 return UTF16_DECODE_SURROGATE(str[0], str[1]);
777 }
778 }
779
780 if(p_size != NULL)
781 *p_size = 1;
782 return str[0];
783 }
784
785 static unsigned
786 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
787 {
788 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
789 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
790
791 return CH(off);
792 }
793
794 /* No whitespace uses surrogates, so no decoding needed here. */
795 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
796 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off))
797 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1))
798
799 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
800 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
801
802 static inline int
803 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
804 {
805 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
806 }
807#elif defined MD4C_USE_UTF8
808 #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f)
809 #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0)
810 #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0)
811 #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0)
812 #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80)
813
814 static unsigned
815 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
816 {
817 if(!IS_UTF8_LEAD1(str[0])) {
818 if(IS_UTF8_LEAD2(str[0])) {
819 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
820 if(p_size != NULL)
821 *p_size = 2;
822
823 return (((unsigned int)str[0] & 0x1f) << 6) |
824 (((unsigned int)str[1] & 0x3f) << 0);
825 }
826 } else if(IS_UTF8_LEAD3(str[0])) {
827 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
828 if(p_size != NULL)
829 *p_size = 3;
830
831 return (((unsigned int)str[0] & 0x0f) << 12) |
832 (((unsigned int)str[1] & 0x3f) << 6) |
833 (((unsigned int)str[2] & 0x3f) << 0);
834 }
835 } else if(IS_UTF8_LEAD4(str[0])) {
836 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
837 if(p_size != NULL)
838 *p_size = 4;
839
840 return (((unsigned int)str[0] & 0x07) << 18) |
841 (((unsigned int)str[1] & 0x3f) << 12) |
842 (((unsigned int)str[2] & 0x3f) << 6) |
843 (((unsigned int)str[3] & 0x3f) << 0);
844 }
845 }
846 }
847
848 if(p_size != NULL)
849 *p_size = 1;
850 return (unsigned) str[0];
851 }
852
853 static unsigned
854 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
855 {
856 if(!IS_UTF8_LEAD1(CH(off-1))) {
857 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
858 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
859 (((unsigned int)CH(off-1) & 0x3f) << 0);
860
861 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
862 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
863 (((unsigned int)CH(off-2) & 0x3f) << 6) |
864 (((unsigned int)CH(off-1) & 0x3f) << 0);
865
866 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
867 return (((unsigned int)CH(off-4) & 0x07) << 18) |
868 (((unsigned int)CH(off-3) & 0x3f) << 12) |
869 (((unsigned int)CH(off-2) & 0x3f) << 6) |
870 (((unsigned int)CH(off-1) & 0x3f) << 0);
871 }
872
873 return (unsigned) CH(off-1);
874 }
875
/* Unicode-aware character class tests for the UTF-8 build.
 *
 * ISUNICODEWHITESPACE_() takes an already decoded codepoint. The other
 * macros take a document offset: the plain variants decode the codepoint
 * starting at 'off', the ...BEFORE variants the one ending right before
 * 'off'. The offset-taking macros refer to a variable named 'ctx' which has
 * to exist in the scope where they are used. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

#define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
882
883 static inline unsigned
884 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
885 {
886 return md_decode_utf8__(str+off, str_size-off, p_char_size);
887 }
888#else
/* Fallback used when none of the Unicode-aware branches above is selected:
 * the "Unicode" tests simply map to the plain ASCII classification macros,
 * and the ...BEFORE variants look at the single character at 'off'-1. */
#define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
#define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
#define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)

#define ISUNICODEPUNCT(off)             ISPUNCT(off)
#define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
895
896 static inline void
897 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
898 {
899 info->codepoints[0] = codepoint;
900 if(ISUPPER_(codepoint))
901 info->codepoints[0] += 'a' - 'A';
902 info->n_codepoints = 1;
903 }
904
905 static inline unsigned
906 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
907 {
908 *p_size = 1;
909 return (unsigned) str[off];
910 }
911#endif
912
913
914/*************************************
915 *** Helper string manipulations ***
916 *************************************/
917
918/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
919 * line breaks with given replacement character.
920 *
921 * NOTE: Caller is responsible to make sure the buffer is large enough.
922 * (Given the output is always shorter then input, (end - beg) is good idea
923 * what the caller should allocate.)
924 */
925static void
926md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
927 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
928{
929 CHAR* ptr = buffer;
930 int line_index = 0;
931 OFF off = beg;
932
933 MD_UNUSED(n_lines);
934
935 while(1) {
936 const MD_LINE* line = &lines[line_index];
937 OFF line_end = line->end;
938 if(end < line_end)
939 line_end = end;
940
941 while(off < line_end) {
942 *ptr = CH(off);
943 ptr++;
944 off++;
945 }
946
947 if(off >= end) {
948 *p_size = (MD_SIZE)(ptr - buffer);
949 return;
950 }
951
952 *ptr = line_break_replacement_char;
953 ptr++;
954
955 line_index++;
956 off = lines[line_index].beg;
957 }
958}
959
960/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
961 */
962static int
963md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
964 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
965{
966 CHAR* buffer;
967
968 buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
969 if(buffer == NULL) {
970 MD_LOG("malloc() failed.");
971 return -1;
972 }
973
974 md_merge_lines(ctx, beg, end, lines, n_lines,
975 line_break_replacement_char, buffer, p_size);
976
977 *p_str = buffer;
978 return 0;
979}
980
981static OFF
982md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
983{
984 SZ char_size;
985 unsigned codepoint;
986
987 while(off < size) {
988 codepoint = md_decode_unicode(label, off, size, &char_size);
989 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
990 break;
991 off += char_size;
992 }
993
994 return off;
995}
996
997
998/******************************
999 *** Recognizing raw HTML ***
1000 ******************************/
1001
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The function checks whether an opening or closing HTML tag starts at the
 * offset 'beg' (which has to hold '<'). The final '>' has to be located
 * before 'max_end'. On success, TRUE is returned and '*p_end' is set to the
 * offset just behind the final '>'. Otherwise FALSE is returned and '*p_end'
 * is left untouched.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info, the only limit is the end of the document. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line being scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end || !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        while(off < line_end && !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside an attribute value: only its terminator matters. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--; /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The current line is exhausted; continue on the next one, if any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break acts as a whitespace: it ends an unquoted value
         * and allows an attribute name to follow. Quoted values (states 42
         * and 43) continue on the next line. */
        if(attr_state == 0 || attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the final '>'. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1126
1127static int
1128md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1129 const MD_LINE* lines, int n_lines,
1130 OFF beg, OFF max_end, OFF* p_end,
1131 OFF* p_scan_horizon)
1132{
1133 OFF off = beg;
1134 int i = 0;
1135
1136 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1137 /* We have already scanned the range up to the max_end so we know
1138 * there is nothing to see. */
1139 return FALSE;
1140 }
1141
1142 while(TRUE) {
1143 while(off + len <= lines[i].end && off + len <= max_end) {
1144 if(md_ascii_eq(STR(off), str, len)) {
1145 /* Success. */
1146 *p_end = off + len;
1147 return TRUE;
1148 }
1149 off++;
1150 }
1151
1152 i++;
1153 if(off >= max_end || i >= n_lines) {
1154 /* Failure. */
1155 *p_scan_horizon = off;
1156 return FALSE;
1157 }
1158
1159 off = lines[i].beg;
1160 }
1161}
1162
1163static int
1164md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1165{
1166 OFF off = beg;
1167
1168 MD_ASSERT(CH(beg) == _T('<'));
1169
1170 if(off + 4 >= lines[0].end)
1171 return FALSE;
1172 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1173 return FALSE;
1174 off += 4;
1175
1176 /* ">" and "->" must not follow the opening. */
1177 if(off < lines[0].end && CH(off) == _T('>'))
1178 return FALSE;
1179 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1180 return FALSE;
1181
1182 /* HTML comment must not contain "--", so we scan just for "--" instead
1183 * of "-->" and verify manually that '>' follows. */
1184 if(md_scan_for_html_closer(ctx, _T("--"), 2,
1185 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1186 {
1187 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1188 *p_end = *p_end + 1;
1189 return TRUE;
1190 }
1191 }
1192
1193 return FALSE;
1194}
1195
1196static int
1197md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1198{
1199 OFF off = beg;
1200
1201 if(off + 2 >= lines[0].end)
1202 return FALSE;
1203 if(CH(off+1) != _T('?'))
1204 return FALSE;
1205 off += 2;
1206
1207 return md_scan_for_html_closer(ctx, _T("?>"), 2,
1208 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1209}
1210
1211static int
1212md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1213{
1214 OFF off = beg;
1215
1216 if(off + 2 >= lines[0].end)
1217 return FALSE;
1218 if(CH(off+1) != _T('!'))
1219 return FALSE;
1220 off += 2;
1221
1222 /* Declaration name. */
1223 if(off >= lines[0].end || !ISALPHA(off))
1224 return FALSE;
1225 off++;
1226 while(off < lines[0].end && ISALPHA(off))
1227 off++;
1228 if(off < lines[0].end && !ISWHITESPACE(off))
1229 return FALSE;
1230
1231 return md_scan_for_html_closer(ctx, _T(">"), 1,
1232 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1233}
1234
1235static int
1236md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1237{
1238 static const CHAR open_str[] = _T("<![CDATA[");
1239 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1240
1241 OFF off = beg;
1242
1243 if(off + open_size >= lines[0].end)
1244 return FALSE;
1245 if(memcmp(STR(off), open_str, open_size) != 0)
1246 return FALSE;
1247 off += open_size;
1248
1249 if(lines[n_lines-1].end < max_end)
1250 max_end = lines[n_lines-1].end - 2;
1251
1252 return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1253 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1254}
1255
1256static int
1257md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1258{
1259 MD_ASSERT(CH(beg) == _T('<'));
1260 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1261 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1262 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1263 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1264 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1265}
1266
1267
1268/****************************
1269 *** Recognizing Entity ***
1270 ****************************/
1271
1272static int
1273md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1274{
1275 OFF off = beg;
1276 MD_UNUSED(ctx);
1277
1278 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1279 off++;
1280
1281 if(1 <= off - beg && off - beg <= 6) {
1282 *p_end = off;
1283 return TRUE;
1284 } else {
1285 return FALSE;
1286 }
1287}
1288
1289static int
1290md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1291{
1292 OFF off = beg;
1293 MD_UNUSED(ctx);
1294
1295 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1296 off++;
1297
1298 if(1 <= off - beg && off - beg <= 7) {
1299 *p_end = off;
1300 return TRUE;
1301 } else {
1302 return FALSE;
1303 }
1304}
1305
1306static int
1307md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1308{
1309 OFF off = beg;
1310 MD_UNUSED(ctx);
1311
1312 if(off < max_end && ISALPHA_(text[off]))
1313 off++;
1314 else
1315 return FALSE;
1316
1317 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1318 off++;
1319
1320 if(2 <= off - beg && off - beg <= 48) {
1321 *p_end = off;
1322 return TRUE;
1323 } else {
1324 return FALSE;
1325 }
1326}
1327
1328static int
1329md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1330{
1331 int is_contents;
1332 OFF off = beg;
1333
1334 MD_ASSERT(text[off] == _T('&'));
1335 off++;
1336
1337 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1338 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1339 else if(off+1 < max_end && text[off] == _T('#'))
1340 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1341 else
1342 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1343
1344 if(is_contents && off < max_end && text[off] == _T(';')) {
1345 *p_end = off+1;
1346 return TRUE;
1347 } else {
1348 return FALSE;
1349 }
1350}
1351
1352static inline int
1353md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1354{
1355 return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1356}
1357
1358
1359/******************************
1360 *** Attribute Management ***
1361 ******************************/
1362
/* Working data of md_build_attribute(), which also owns whatever memory the
 * resulting MD_ATTRIBUTE points to. To be released with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    /* Text of the attribute. In the trivial case it just aliases the raw
     * input text (or is NULL for an empty input), otherwise it is malloc()-ed. */
    CHAR* text;
    /* Type of each substring. */
    MD_TEXTTYPE* substr_types;
    /* Start offset of each substring, plus one final item holding the total size. */
    OFF* substr_offsets;
    /* Count of substrings stored so far. */
    int substr_count;
    /* Capacity of the heap-allocated arrays; zero means nothing is to be
     * freed (the arrays below are used instead). */
    int substr_alloc;
    /* In-place storage for the trivial case of a single MD_TEXT_NORMAL substring. */
    MD_TEXTTYPE trivial_types[1];
    OFF trivial_offsets[2];
};
1373
1374
/* Flag for md_build_attribute(): keep backslashes as they are instead of
 * treating them as escapes of the following character. */
#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1376
1377static int
1378md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1379 MD_TEXTTYPE type, OFF off)
1380{
1381 if(build->substr_count >= build->substr_alloc) {
1382 MD_TEXTTYPE* new_substr_types;
1383 OFF* new_substr_offsets;
1384
1385 build->substr_alloc = (build->substr_alloc > 0
1386 ? build->substr_alloc + build->substr_alloc / 2
1387 : 8);
1388 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1389 build->substr_alloc * sizeof(MD_TEXTTYPE));
1390 if(new_substr_types == NULL) {
1391 MD_LOG("realloc() failed.");
1392 return -1;
1393 }
1394 /* Note +1 to reserve space for final offset (== raw_size). */
1395 new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1396 (build->substr_alloc+1) * sizeof(OFF));
1397 if(new_substr_offsets == NULL) {
1398 MD_LOG("realloc() failed.");
1399 free(new_substr_types);
1400 return -1;
1401 }
1402
1403 build->substr_types = new_substr_types;
1404 build->substr_offsets = new_substr_offsets;
1405 }
1406
1407 build->substr_types[build->substr_count] = type;
1408 build->substr_offsets[build->substr_count] = off;
1409 build->substr_count++;
1410 return 0;
1411}
1412
1413static void
1414md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1415{
1416 MD_UNUSED(ctx);
1417
1418 if(build->substr_alloc > 0) {
1419 free(build->text);
1420 free(build->substr_types);
1421 free(build->substr_offsets);
1422 }
1423}
1424
1425static int
1426md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1427 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1428{
1429 OFF raw_off, off;
1430 int is_trivial;
1431 int ret = 0;
1432
1433 memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1434
1435 /* If there is no backslash and no ampersand, build trivial attribute
1436 * without any malloc(). */
1437 is_trivial = TRUE;
1438 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1439 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1440 is_trivial = FALSE;
1441 break;
1442 }
1443 }
1444
1445 if(is_trivial) {
1446 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1447 build->substr_types = build->trivial_types;
1448 build->substr_offsets = build->trivial_offsets;
1449 build->substr_count = 1;
1450 build->substr_alloc = 0;
1451 build->trivial_types[0] = MD_TEXT_NORMAL;
1452 build->trivial_offsets[0] = 0;
1453 build->trivial_offsets[1] = raw_size;
1454 off = raw_size;
1455 } else {
1456 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1457 if(build->text == NULL) {
1458 MD_LOG("malloc() failed.");
1459 goto abort;
1460 }
1461
1462 raw_off = 0;
1463 off = 0;
1464
1465 while(raw_off < raw_size) {
1466 if(raw_text[raw_off] == _T('\0')) {
1467 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1468 memcpy(build->text + off, raw_text + raw_off, 1);
1469 off++;
1470 raw_off++;
1471 continue;
1472 }
1473
1474 if(raw_text[raw_off] == _T('&')) {
1475 OFF ent_end;
1476
1477 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1478 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1479 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1480 off += ent_end - raw_off;
1481 raw_off = ent_end;
1482 continue;
1483 }
1484 }
1485
1486 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1487 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1488
1489 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1490 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1491 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1492 raw_off++;
1493
1494 build->text[off++] = raw_text[raw_off++];
1495 }
1496 build->substr_offsets[build->substr_count] = off;
1497 }
1498
1499 attr->text = build->text;
1500 attr->size = off;
1501 attr->substr_offsets = build->substr_offsets;
1502 attr->substr_types = build->substr_types;
1503 return 0;
1504
1505abort:
1506 md_free_attribute(ctx, build);
1507 return -1;
1508}
1509
1510
1511/*********************************************
1512 *** Dictionary of Reference Definitions ***
1513 *********************************************/
1514
/* Parameters of the 32-bit FNV-1a hash function. */
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Feed 'n' bytes of 'data' into the FNV-1a hash whose state so far is
 * 'base', and return the new state. Start with MD_FNV1A_BASE; to hash data
 * arriving in several pieces, pass the result of the previous call as
 * 'base' of the next one. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* bytes = (const unsigned char*) data;
    unsigned hash = base;

    for(; n > 0; n--, bytes++)
        hash = (hash ^ *bytes) * MD_FNV1A_PRIME;

    return hash;
}
1532
1533
/* Record of one link reference definition. */
struct MD_REF_DEF_tag {
    /* Label identifying the definition ('label_size' characters). Labels
     * are hashed with md_link_label_hash() and compared with
     * md_link_label_cmp(). */
    CHAR* label;
    /* Presumably the (optional) link title, 'title_size' characters long
     * -- TODO confirm against the code filling this structure. */
    CHAR* title;
    /* Hash of the label; computed when the hashtable of the definitions
     * is being built. */
    unsigned hash;
    SZ label_size;
    SZ title_size;
    /* Presumably the range of the link destination in the document text
     * -- TODO confirm. */
    OFF dest_beg;
    OFF dest_end;
    /* Presumably set when 'label' or 'title', respectively, is a heap
     * buffer owned by this record -- TODO confirm. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1545
1546/* Label equivalence is quite complicated with regards to whitespace and case
1547 * folding. This complicates computing a hash of it as well as direct comparison
1548 * of two labels. */
1549
1550static unsigned
1551md_link_label_hash(const CHAR* label, SZ size)
1552{
1553 unsigned hash = MD_FNV1A_BASE;
1554 OFF off;
1555 unsigned codepoint;
1556 int is_whitespace = FALSE;
1557
1558 off = md_skip_unicode_whitespace(label, 0, size);
1559 while(off < size) {
1560 SZ char_size;
1561
1562 codepoint = md_decode_unicode(label, off, size, &char_size);
1563 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1564
1565 if(is_whitespace) {
1566 codepoint = ' ';
1567 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1568 off = md_skip_unicode_whitespace(label, off, size);
1569 } else {
1570 MD_UNICODE_FOLD_INFO fold_info;
1571
1572 md_get_unicode_fold_info(codepoint, &fold_info);
1573 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1574 off += char_size;
1575 }
1576 }
1577
1578 return hash;
1579}
1580
1581static OFF
1582md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1583 MD_UNICODE_FOLD_INFO* fold_info)
1584{
1585 unsigned codepoint;
1586 SZ char_size;
1587
1588 if(off >= size) {
1589 /* Treat end of a link label as a whitespace. */
1590 goto whitespace;
1591 }
1592
1593 codepoint = md_decode_unicode(label, off, size, &char_size);
1594 off += char_size;
1595 if(ISUNICODEWHITESPACE_(codepoint)) {
1596 /* Treat all whitespace as equivalent */
1597 goto whitespace;
1598 }
1599
1600 /* Get real folding info. */
1601 md_get_unicode_fold_info(codepoint, fold_info);
1602 return off;
1603
1604whitespace:
1605 fold_info->codepoints[0] = _T(' ');
1606 fold_info->n_codepoints = 1;
1607 return md_skip_unicode_whitespace(label, off, size);
1608}
1609
1610static int
1611md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1612{
1613 OFF a_off;
1614 OFF b_off;
1615 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1616 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1617 OFF a_fi_off = 0;
1618 OFF b_fi_off = 0;
1619 int cmp;
1620
1621 a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1622 b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1623 while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1624 b_off < b_size || b_fi_off < b_fi.n_codepoints)
1625 {
1626 /* If needed, load fold info for next char. */
1627 if(a_fi_off >= a_fi.n_codepoints) {
1628 a_fi_off = 0;
1629 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1630 }
1631 if(b_fi_off >= b_fi.n_codepoints) {
1632 b_fi_off = 0;
1633 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1634 }
1635
1636 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1637 if(cmp != 0)
1638 return cmp;
1639
1640 a_fi_off++;
1641 b_fi_off++;
1642 }
1643
1644 return 0;
1645}
1646
/* Contents of a hashtable bucket shared by multiple reference definitions. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;         /* Count of the used items in ref_defs[]. */
    int alloc_ref_defs;     /* Count of the allocated items in ref_defs[]. */
    MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */
};
1653
1654static int
1655md_ref_def_cmp(const void* a, const void* b)
1656{
1657 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1658 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1659
1660 if(a_ref->hash < b_ref->hash)
1661 return -1;
1662 else if(a_ref->hash > b_ref->hash)
1663 return +1;
1664 else
1665 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1666}
1667
1668static int
1669md_ref_def_cmp_for_sort(const void* a, const void* b)
1670{
1671 int cmp;
1672
1673 cmp = md_ref_def_cmp(a, b);
1674
1675 /* Ensure stability of the sorting. */
1676 if(cmp == 0) {
1677 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1678 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1679
1680 if(a_ref < b_ref)
1681 cmp = -1;
1682 else if(a_ref > b_ref)
1683 cmp = +1;
1684 else
1685 cmp = 0;
1686 }
1687
1688 return cmp;
1689}
1690
/* Build the hashtable ctx->ref_def_hashtable[] over all the reference
 * definitions in ctx->ref_defs[], so that md_lookup_ref_def() can find them
 * by their labels.
 *
 * Returns 0 on success (this includes the case there are no definitions at
 * all and hence no hashtable is created), or -1 on allocation failure.
 * Nothing is released here in the case of the failure; presumably the
 * caller is expected to call md_free_ref_def_hashtable() anyway
 * -- TODO confirm.
 */
static int
md_build_ref_def_hashtable(MD_CTX* ctx)
{
    int i, j;

    if(ctx->n_ref_defs == 0)
        return 0;

    /* Make the table a little larger than the count of the definitions. */
    ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
    ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
    if(ctx->ref_def_hashtable == NULL) {
        MD_LOG("malloc() failed.");
        goto abort;
    }
    memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));

    /* Each member of ctx->ref_def_hashtable[] can be:
     * -- NULL,
     * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
     * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
     * such MD_REF_DEFs.
     *
     * The latter two are told apart by testing whether the pointer points
     * into the array ctx->ref_defs[].
     */
    for(i = 0; i < ctx->n_ref_defs; i++) {
        MD_REF_DEF* def = &ctx->ref_defs[i];
        void* bucket;
        MD_REF_DEF_LIST* list;

        def->hash = md_link_label_hash(def->label, def->label_size);
        bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];

        if(bucket == NULL) {
            /* The bucket is empty. Make it just point to the def. */
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
            continue;
        }

        if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
            /* The bucket already contains one ref. def. Lets see whether it
             * is the same label (ref. def. duplicate) or different one
             * (hash conflict). */
            MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;

            if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
                /* Duplicate label: Ignore this ref. def. */
                continue;
            }

            /* Make the bucket complex, i.e. able to hold more ref. defs. */
            list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
            if(list == NULL) {
                MD_LOG("malloc() failed.");
                goto abort;
            }
            list->ref_defs[0] = old_def;
            list->ref_defs[1] = def;
            list->n_ref_defs = 2;
            list->alloc_ref_defs = 2;
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
            continue;
        }

        /* Append the def to the complex bucket list.
         *
         * Note in this case we ignore potential duplicates to avoid expensive
         * iterating over the complex bucket. Below, we revisit all the complex
         * buckets and handle it more cheaply after the complex bucket contents
         * is sorted. */
        list = (MD_REF_DEF_LIST*) bucket;
        if(list->n_ref_defs >= list->alloc_ref_defs) {
            /* Grow the list by half of its capacity. */
            int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
            MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
                        sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
            if(list_tmp == NULL) {
                MD_LOG("realloc() failed.");
                goto abort;
            }
            list = list_tmp;
            list->alloc_ref_defs = alloc_ref_defs;
            /* The list may have been moved; update the bucket. */
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
        }

        list->ref_defs[list->n_ref_defs] = def;
        list->n_ref_defs++;
    }

    /* Sort the complex buckets so we can use bsearch() with them. */
    for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
        void* bucket = ctx->ref_def_hashtable[i];
        MD_REF_DEF_LIST* list;

        /* Skip the empty and the simple buckets. */
        if(bucket == NULL)
            continue;
        if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
            continue;

        list = (MD_REF_DEF_LIST*) bucket;
        qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);

        /* Disable all duplicates in the complex bucket by forcing all such
         * records to point to the 1st such ref. def. I.e. no matter which
         * record is found during the lookup, it will always point to the right
         * ref. def. in ctx->ref_defs[]. */
        for(j = 1; j < list->n_ref_defs; j++) {
            if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
                list->ref_defs[j] = list->ref_defs[j-1];
        }
    }

    return 0;

abort:
    return -1;
}
1804
1805static void
1806md_free_ref_def_hashtable(MD_CTX* ctx)
1807{
1808 if(ctx->ref_def_hashtable != NULL) {
1809 int i;
1810
1811 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1812 void* bucket = ctx->ref_def_hashtable[i];
1813 if(bucket == NULL)
1814 continue;
1815 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1816 continue;
1817 free(bucket);
1818 }
1819
1820 free(ctx->ref_def_hashtable);
1821 }
1822}
1823
1824static const MD_REF_DEF*
1825md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1826{
1827 unsigned hash;
1828 void* bucket;
1829
1830 if(ctx->ref_def_hashtable_size == 0)
1831 return NULL;
1832
1833 hash = md_link_label_hash(label, label_size);
1834 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1835
1836 if(bucket == NULL) {
1837 return NULL;
1838 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1839 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1840
1841 if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1842 return def;
1843 else
1844 return NULL;
1845 } else {
1846 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1847 MD_REF_DEF key_buf;
1848 const MD_REF_DEF* key = &key_buf;
1849 const MD_REF_DEF** ret;
1850
1851 key_buf.label = (CHAR*) label;
1852 key_buf.label_size = label_size;
1853 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1854
1855 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1856 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1857 if(ret != NULL)
1858 return *ret;
1859 else
1860 return NULL;
1861 }
1862}
1863
1864
1865/***************************
1866 *** Recognizing Links ***
1867 ***************************/
1868
1869/* Note this code is partially shared between processing inlines and blocks
1870 * as reference definitions and links share some helper parser functions.
1871 */
1872
/* Attributes of a link. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    /* Presumably the range of the link destination in the document text
     * -- TODO confirm against the code filling this structure. */
    OFF dest_beg;
    OFF dest_end;

    /* Title of the link and its size. */
    CHAR* title;
    SZ title_size;
    /* Presumably non-zero if 'title' is a heap buffer the holder of this
     * structure has to free -- TODO confirm. */
    int title_needs_free;
};
1882
1883
1884static int
1885md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1886 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1887 OFF* p_contents_beg, OFF* p_contents_end)
1888{
1889 OFF off = beg;
1890 OFF contents_beg = 0;
1891 OFF contents_end = 0;
1892 int line_index = 0;
1893 int len = 0;
1894
1895 if(CH(off) != _T('['))
1896 return FALSE;
1897 off++;
1898
1899 while(1) {
1900 OFF line_end = lines[line_index].end;
1901
1902 while(off < line_end) {
1903 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1904 if(contents_end == 0) {
1905 contents_beg = off;
1906 *p_beg_line_index = line_index;
1907 }
1908 contents_end = off + 2;
1909 off += 2;
1910 } else if(CH(off) == _T('[')) {
1911 return FALSE;
1912 } else if(CH(off) == _T(']')) {
1913 if(contents_beg < contents_end) {
1914 /* Success. */
1915 *p_contents_beg = contents_beg;
1916 *p_contents_end = contents_end;
1917 *p_end = off+1;
1918 *p_end_line_index = line_index;
1919 return TRUE;
1920 } else {
1921 /* Link label must have some non-whitespace contents. */
1922 return FALSE;
1923 }
1924 } else {
1925 unsigned codepoint;
1926 SZ char_size;
1927
1928 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1929 if(!ISUNICODEWHITESPACE_(codepoint)) {
1930 if(contents_end == 0) {
1931 contents_beg = off;
1932 *p_beg_line_index = line_index;
1933 }
1934 contents_end = off + char_size;
1935 }
1936
1937 off += char_size;
1938 }
1939
1940 len++;
1941 if(len > 999)
1942 return FALSE;
1943 }
1944
1945 line_index++;
1946 len++;
1947 if(line_index < n_lines)
1948 off = lines[line_index].beg;
1949 else
1950 break;
1951 }
1952
1953 return FALSE;
1954}
1955
1956static int
1957md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1958 OFF* p_contents_beg, OFF* p_contents_end)
1959{
1960 OFF off = beg;
1961
1962 if(off >= max_end || CH(off) != _T('<'))
1963 return FALSE;
1964 off++;
1965
1966 while(off < max_end) {
1967 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1968 off += 2;
1969 continue;
1970 }
1971
1972 if(ISNEWLINE(off) || CH(off) == _T('<'))
1973 return FALSE;
1974
1975 if(CH(off) == _T('>')) {
1976 /* Success. */
1977 *p_contents_beg = beg+1;
1978 *p_contents_end = off;
1979 *p_end = off+1;
1980 return TRUE;
1981 }
1982
1983 off++;
1984 }
1985
1986 return FALSE;
1987}
1988
1989static int
1990md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1991 OFF* p_contents_beg, OFF* p_contents_end)
1992{
1993 OFF off = beg;
1994 int parenthesis_level = 0;
1995
1996 while(off < max_end) {
1997 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1998 off += 2;
1999 continue;
2000 }
2001
2002 if(ISWHITESPACE(off) || ISCNTRL(off))
2003 break;
2004
2005 /* Link destination may include balanced pairs of unescaped '(' ')'.
2006 * Note we limit the maximal nesting level by 32 to protect us from
2007 * https://github.com/jgm/cmark/issues/214 */
2008 if(CH(off) == _T('(')) {
2009 parenthesis_level++;
2010 if(parenthesis_level > 32)
2011 return FALSE;
2012 } else if(CH(off) == _T(')')) {
2013 if(parenthesis_level == 0)
2014 break;
2015 parenthesis_level--;
2016 }
2017
2018 off++;
2019 }
2020
2021 if(parenthesis_level != 0 || off == beg)
2022 return FALSE;
2023
2024 /* Success. */
2025 *p_contents_beg = beg;
2026 *p_contents_end = off;
2027 *p_end = off;
2028 return TRUE;
2029}
2030
2031static inline int
2032md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2033 OFF* p_contents_beg, OFF* p_contents_end)
2034{
2035 if(CH(beg) == _T('<'))
2036 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2037 else
2038 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2039}
2040
2041static int
2042md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2043 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2044 OFF* p_contents_beg, OFF* p_contents_end)
2045{
2046 OFF off = beg;
2047 CHAR closer_char;
2048 int line_index = 0;
2049
2050 /* White space with up to one line break. */
2051 while(off < lines[line_index].end && ISWHITESPACE(off))
2052 off++;
2053 if(off >= lines[line_index].end) {
2054 line_index++;
2055 if(line_index >= n_lines)
2056 return FALSE;
2057 off = lines[line_index].beg;
2058 }
2059 if(off == beg)
2060 return FALSE;
2061
2062 *p_beg_line_index = line_index;
2063
2064 /* First char determines how to detect end of it. */
2065 switch(CH(off)) {
2066 case _T('"'): closer_char = _T('"'); break;
2067 case _T('\''): closer_char = _T('\''); break;
2068 case _T('('): closer_char = _T(')'); break;
2069 default: return FALSE;
2070 }
2071 off++;
2072
2073 *p_contents_beg = off;
2074
2075 while(line_index < n_lines) {
2076 OFF line_end = lines[line_index].end;
2077
2078 while(off < line_end) {
2079 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2080 off++;
2081 } else if(CH(off) == closer_char) {
2082 /* Success. */
2083 *p_contents_end = off;
2084 *p_end = off+1;
2085 *p_end_line_index = line_index;
2086 return TRUE;
2087 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2088 /* ()-style title cannot contain (unescaped '(')) */
2089 return FALSE;
2090 }
2091
2092 off++;
2093 }
2094
2095 line_index++;
2096 }
2097
2098 return FALSE;
2099}
2100
2101/* Returns 0 if it is not a reference definition.
2102 *
2103 * Returns N > 0 if it is a reference definition. N then corresponds to the
2104 * number of lines forming it). In this case the definition is stored for
2105 * resolving any links referring to it.
2106 *
2107 * Returns -1 in case of an error (out of memory).
2108 */
2109static int
2110md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2111{
2112 OFF label_contents_beg;
2113 OFF label_contents_end;
2114 int label_contents_line_index = -1;
2115 int label_is_multiline = FALSE;
2116 OFF dest_contents_beg;
2117 OFF dest_contents_end;
2118 OFF title_contents_beg;
2119 OFF title_contents_end;
2120 int title_contents_line_index;
2121 int title_is_multiline = FALSE;
2122 OFF off;
2123 int line_index = 0;
2124 int tmp_line_index;
2125 MD_REF_DEF* def = NULL;
2126 int ret = 0;
2127
2128 /* Link label. */
2129 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2130 &off, &label_contents_line_index, &line_index,
2131 &label_contents_beg, &label_contents_end))
2132 return FALSE;
2133 label_is_multiline = (label_contents_line_index != line_index);
2134
2135 /* Colon. */
2136 if(off >= lines[line_index].end || CH(off) != _T(':'))
2137 return FALSE;
2138 off++;
2139
2140 /* Optional white space with up to one line break. */
2141 while(off < lines[line_index].end && ISWHITESPACE(off))
2142 off++;
2143 if(off >= lines[line_index].end) {
2144 line_index++;
2145 if(line_index >= n_lines)
2146 return FALSE;
2147 off = lines[line_index].beg;
2148 }
2149
2150 /* Link destination. */
2151 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2152 &off, &dest_contents_beg, &dest_contents_end))
2153 return FALSE;
2154
2155 /* (Optional) title. Note we interpret it as an title only if nothing
2156 * more follows on its last line. */
2157 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2158 &off, &title_contents_line_index, &tmp_line_index,
2159 &title_contents_beg, &title_contents_end)
2160 && off >= lines[line_index + tmp_line_index].end)
2161 {
2162 title_is_multiline = (tmp_line_index != title_contents_line_index);
2163 title_contents_line_index += line_index;
2164 line_index += tmp_line_index;
2165 } else {
2166 /* Not a title. */
2167 title_is_multiline = FALSE;
2168 title_contents_beg = off;
2169 title_contents_end = off;
2170 title_contents_line_index = 0;
2171 }
2172
2173 /* Nothing more can follow on the last line. */
2174 if(off < lines[line_index].end)
2175 return FALSE;
2176
2177 /* So, it _is_ a reference definition. Remember it. */
2178 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2179 MD_REF_DEF* new_defs;
2180
2181 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2182 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2183 : 16);
2184 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2185 if(new_defs == NULL) {
2186 MD_LOG("realloc() failed.");
2187 goto abort;
2188 }
2189
2190 ctx->ref_defs = new_defs;
2191 }
2192 def = &ctx->ref_defs[ctx->n_ref_defs];
2193 memset(def, 0, sizeof(MD_REF_DEF));
2194
2195 if(label_is_multiline) {
2196 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2197 lines + label_contents_line_index, n_lines - label_contents_line_index,
2198 _T(' '), &def->label, &def->label_size));
2199 def->label_needs_free = TRUE;
2200 } else {
2201 def->label = (CHAR*) STR(label_contents_beg);
2202 def->label_size = label_contents_end - label_contents_beg;
2203 }
2204
2205 if(title_is_multiline) {
2206 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2207 lines + title_contents_line_index, n_lines - title_contents_line_index,
2208 _T('\n'), &def->title, &def->title_size));
2209 def->title_needs_free = TRUE;
2210 } else {
2211 def->title = (CHAR*) STR(title_contents_beg);
2212 def->title_size = title_contents_end - title_contents_beg;
2213 }
2214
2215 def->dest_beg = dest_contents_beg;
2216 def->dest_end = dest_contents_end;
2217
2218 /* Success. */
2219 ctx->n_ref_defs++;
2220 return line_index + 1;
2221
2222abort:
2223 /* Failure. */
2224 if(def != NULL && def->label_needs_free)
2225 free(def->label);
2226 if(def != NULL && def->title_needs_free)
2227 free(def->title);
2228 return ret;
2229}
2230
2231static int
2232md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2233 OFF beg, OFF end, MD_LINK_ATTR* attr)
2234{
2235 const MD_REF_DEF* def;
2236 const MD_LINE* beg_line;
2237 int is_multiline;
2238 CHAR* label;
2239 SZ label_size;
2240 int ret;
2241
2242 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2243 MD_ASSERT(CH(end-1) == _T(']'));
2244
2245 beg += (CH(beg) == _T('!') ? 2 : 1);
2246 end--;
2247
2248 /* Find lines corresponding to the beg and end positions. */
2249 beg_line = md_lookup_line(beg, lines, n_lines);
2250 is_multiline = (end > beg_line->end);
2251
2252 if(is_multiline) {
2253 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2254 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size));
2255 } else {
2256 label = (CHAR*) STR(beg);
2257 label_size = end - beg;
2258 }
2259
2260 def = md_lookup_ref_def(ctx, label, label_size);
2261 if(def != NULL) {
2262 attr->dest_beg = def->dest_beg;
2263 attr->dest_end = def->dest_end;
2264 attr->title = def->title;
2265 attr->title_size = def->title_size;
2266 attr->title_needs_free = FALSE;
2267 }
2268
2269 if(is_multiline)
2270 free(label);
2271
2272 ret = (def != NULL);
2273
2274abort:
2275 return ret;
2276}
2277
2278static int
2279md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2280 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2281{
2282 int line_index = 0;
2283 int tmp_line_index;
2284 OFF title_contents_beg;
2285 OFF title_contents_end;
2286 int title_contents_line_index;
2287 int title_is_multiline;
2288 OFF off = beg;
2289 int ret = FALSE;
2290
2291 while(off >= lines[line_index].end)
2292 line_index++;
2293
2294 MD_ASSERT(CH(off) == _T('('));
2295 off++;
2296
2297 /* Optional white space with up to one line break. */
2298 while(off < lines[line_index].end && ISWHITESPACE(off))
2299 off++;
2300 if(off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) {
2301 line_index++;
2302 if(line_index >= n_lines)
2303 return FALSE;
2304 off = lines[line_index].beg;
2305 }
2306
2307 /* Link destination may be omitted, but only when not also having a title. */
2308 if(off < ctx->size && CH(off) == _T(')')) {
2309 attr->dest_beg = off;
2310 attr->dest_end = off;
2311 attr->title = NULL;
2312 attr->title_size = 0;
2313 attr->title_needs_free = FALSE;
2314 off++;
2315 *p_end = off;
2316 return TRUE;
2317 }
2318
2319 /* Link destination. */
2320 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2321 &off, &attr->dest_beg, &attr->dest_end))
2322 return FALSE;
2323
2324 /* (Optional) title. */
2325 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2326 &off, &title_contents_line_index, &tmp_line_index,
2327 &title_contents_beg, &title_contents_end))
2328 {
2329 title_is_multiline = (tmp_line_index != title_contents_line_index);
2330 title_contents_line_index += line_index;
2331 line_index += tmp_line_index;
2332 } else {
2333 /* Not a title. */
2334 title_is_multiline = FALSE;
2335 title_contents_beg = off;
2336 title_contents_end = off;
2337 title_contents_line_index = 0;
2338 }
2339
2340 /* Optional whitespace followed with final ')'. */
2341 while(off < lines[line_index].end && ISWHITESPACE(off))
2342 off++;
2343 if (off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) {
2344 line_index++;
2345 if(line_index >= n_lines)
2346 return FALSE;
2347 off = lines[line_index].beg;
2348 }
2349 if(CH(off) != _T(')'))
2350 goto abort;
2351 off++;
2352
2353 if(title_contents_beg >= title_contents_end) {
2354 attr->title = NULL;
2355 attr->title_size = 0;
2356 attr->title_needs_free = FALSE;
2357 } else if(!title_is_multiline) {
2358 attr->title = (CHAR*) STR(title_contents_beg);
2359 attr->title_size = title_contents_end - title_contents_beg;
2360 attr->title_needs_free = FALSE;
2361 } else {
2362 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2363 lines + title_contents_line_index, n_lines - title_contents_line_index,
2364 _T('\n'), &attr->title, &attr->title_size));
2365 attr->title_needs_free = TRUE;
2366 }
2367
2368 *p_end = off;
2369 ret = TRUE;
2370
2371abort:
2372 return ret;
2373}
2374
2375static void
2376md_free_ref_defs(MD_CTX* ctx)
2377{
2378 int i;
2379
2380 for(i = 0; i < ctx->n_ref_defs; i++) {
2381 MD_REF_DEF* def = &ctx->ref_defs[i];
2382
2383 if(def->label_needs_free)
2384 free(def->label);
2385 if(def->title_needs_free)
2386 free(def->title);
2387 }
2388
2389 free(ctx->ref_defs);
2390}
2391
2392
2393/******************************************
2394 *** Processing Inlines (a.k.a Spans) ***
2395 ******************************************/
2396
2397 /* We process inlines in a few phases:
2398 *
2399 * (1) We go through the block text and collect all significant characters
2400 * which may start/end a span or some other significant position into
2401 * ctx->marks[]. Core of this is what md_collect_marks() does.
2402 *
2403 * We also do some very brief preliminary context-less analysis, whether
2404 * it might be opener or closer (e.g. of an emphasis span).
2405 *
2406 * This speeds the other steps as we do not need to re-iterate over all
2407 * characters anymore.
2408 *
2409 * (2) We analyze each potential mark type, in order by their precedence.
2410 *
2411 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2412 * skipping already resolved regions (in preceding precedences) and try to
2413 * resolve them.
2414 *
2415 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2416 * them as resolved.
2417 *
2418 * (2.2) For range-type marks, we analyze whether the mark could be closer
2419 * and, if yes, whether there is some preceding opener it could satisfy.
2420 *
2421 * If not we check whether it could be really an opener and if yes, we
2422 * remember it so subsequent closers may resolve it.
2423 *
2424 * (3) Finally, when all marks were analyzed, we render the block contents
2425 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2426 * or ::close_span() whenever we reach a resolved mark.
2427 */
2428
2429
2430/* The mark structure.
2431 *
2432 * '\\': Maybe escape sequence.
2433 * '\0': NULL char.
2434 * '*': Maybe (strong) emphasis start/end.
2435 * '_': Maybe (strong) emphasis start/end.
2436 * '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
2437 * '`': Maybe code span start/end.
2438 * '&': Maybe start of entity.
2439 * ';': Maybe end of entity.
2440 * '<': Maybe start of raw HTML or autolink.
2441 * '>': Maybe end of raw HTML or autolink.
2442 * '[': Maybe start of link label or link text.
2443 * '!': Equivalent of '[' for image.
2444 * ']': Maybe end of link label or link text.
2445 * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
2446 * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
2447 * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
2448 * 'D': Dummy mark, it reserves a space for splitting a previous mark
2449 * (e.g. emphasis) or to make more space for storing some special data
2450 * related to the preceding mark (e.g. link).
2451 *
2452 * Note that not all instances of these chars in the text imply creation of the
2453 * structure. Only those which have (or may have, after we see more context)
2454 * the special meaning.
2455 *
2456 * (Keep this struct as small as possible to fit as much of them into CPU
2457 * cache line.)
2458 */
2459struct MD_MARK_tag {
2460 OFF beg;
2461 OFF end;
2462
2463 /* For unresolved openers, 'prev' and 'next' form the chain of open openers
2464 * of given type 'ch'.
2465 *
2466 * During resolving, we disconnect from the chain and point to the
2467 * corresponding counterpart so opener points to its closer and vice versa.
2468 */
2469 int prev;
2470 int next;
2471 CHAR ch;
2472 unsigned char flags;
2473};
2474
/* Mark flags applicable to ALL mark types. */
#define MD_MARK_POTENTIAL_OPENER            0x01    /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02    /* Maybe closer. */
#define MD_MARK_OPENER                      0x04    /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08    /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10    /* Resolved in any definite way. */

/* Mark flags specific to some mark types. These may share the same bits as
 * long as they are never used together for the same type of mark. */

/* Emphasis marks; helpers for the "rule of 3". */
#define MD_MARK_EMPH_INTRAWORD              0x20
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)
#define MD_MARK_EMPH_MOD3_MASK              (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)

/* Distinguisher for '<', '>'. */
#define MD_MARK_AUTOLINK                    0x20

/* For permissive autolinks. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20

/* For '[' to rule out invalid link labels early. */
#define MD_MARK_HASNESTEDBRACKETS           0x20
2491
2492static MD_MARKCHAIN*
2493md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2494{
2495 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2496 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2497 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2498 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2499 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2500 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2501 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2502 default: MD_UNREACHABLE();
2503 }
2504 return NULL;
2505}
2506
2507static MD_MARKCHAIN*
2508md_mark_chain(MD_CTX* ctx, int mark_index)
2509{
2510 MD_MARK* mark = &ctx->marks[mark_index];
2511
2512 switch(mark->ch) {
2513 case _T('*'): return md_asterisk_chain(ctx, mark->flags);
2514 case _T('_'): return &UNDERSCORE_OPENERS;
2515 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2516 case _T('!'): MD_FALLTHROUGH();
2517 case _T('['): return &BRACKET_OPENERS;
2518 case _T('|'): return &TABLECELLBOUNDARIES;
2519 default: return NULL;
2520 }
2521}
2522
2523static MD_MARK*
2524md_push_mark(MD_CTX* ctx)
2525{
2526 if(ctx->n_marks >= ctx->alloc_marks) {
2527 MD_MARK* new_marks;
2528
2529 ctx->alloc_marks = (ctx->alloc_marks > 0
2530 ? ctx->alloc_marks + ctx->alloc_marks / 2
2531 : 64);
2532 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2533 if(new_marks == NULL) {
2534 MD_LOG("realloc() failed.");
2535 return NULL;
2536 }
2537
2538 ctx->marks = new_marks;
2539 }
2540
2541 return &ctx->marks[ctx->n_marks++];
2542}
2543
/* Helper macros for pushing a new mark. They expect the following to be
 * available in the scope where they are used: the variables 'ctx', 'mark'
 * and 'ret' and the label 'abort'.
 *
 * If the mark cannot be allocated, 'ret' is set to -1 and the control
 * jumps to the label 'abort'. */

/* Push a new mark and make 'mark' point to it. The members of the mark are
 * left uninitialized. */
#define PUSH_MARK_()                                                    \
    do {                                                                \
        if((mark = md_push_mark(ctx)) == NULL) {                        \
            ret = -1;                                                   \
            goto abort;                                                 \
        }                                                               \
    } while(0)

/* Push a new mark and initialize it as not linked to any other mark. */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
    do {                                                                \
        PUSH_MARK_();                                                   \
        mark->beg = (beg_);                                             \
        mark->end = (end_);                                             \
        mark->prev = -1;                                                \
        mark->next = -1;                                                \
        mark->ch = (char)(ch_);                                         \
        mark->flags = (flags_);                                         \
    } while(0)
2563
2564
2565static void
2566md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2567{
2568 if(chain->tail >= 0)
2569 ctx->marks[chain->tail].next = mark_index;
2570 else
2571 chain->head = mark_index;
2572
2573 ctx->marks[mark_index].prev = chain->tail;
2574 ctx->marks[mark_index].next = -1;
2575 chain->tail = mark_index;
2576}
2577
2578/* Sometimes, we need to store a pointer into the mark. It is quite rare
2579 * so we do not bother to make MD_MARK use union, and it can only happen
2580 * for dummy marks. */
2581static inline void
2582md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2583{
2584 MD_MARK* mark = &ctx->marks[mark_index];
2585 MD_ASSERT(mark->ch == 'D');
2586
2587 /* Check only members beg and end are misused for this. */
2588 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2589 memcpy(mark, &ptr, sizeof(void*));
2590}
2591
2592static inline void*
2593md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2594{
2595 void* ptr;
2596 MD_MARK* mark = &ctx->marks[mark_index];
2597 MD_ASSERT(mark->ch == 'D');
2598 memcpy(&ptr, mark, sizeof(void*));
2599 return ptr;
2600}
2601
2602static void
2603md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2604{
2605 MD_MARK* opener = &ctx->marks[opener_index];
2606 MD_MARK* closer = &ctx->marks[closer_index];
2607
2608 /* Remove opener from the list of openers. */
2609 if(chain != NULL) {
2610 if(opener->prev >= 0)
2611 ctx->marks[opener->prev].next = opener->next;
2612 else
2613 chain->head = opener->next;
2614
2615 if(opener->next >= 0)
2616 ctx->marks[opener->next].prev = opener->prev;
2617 else
2618 chain->tail = opener->prev;
2619 }
2620
2621 /* Interconnect opener and closer and mark both as resolved. */
2622 opener->next = closer_index;
2623 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2624 closer->prev = opener_index;
2625 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2626}
2627
2628
2629#define MD_ROLLBACK_ALL 0
2630#define MD_ROLLBACK_CROSSING 1
2631
2632/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2633 * resolvings accordingly to these rules:
2634 *
2635 * (1) All openers BEFORE the range corresponding to any closer inside the
2636 * range are un-resolved and they are re-added to their respective chains
2637 * of unresolved openers. This ensures we can reuse the opener for closers
2638 * AFTER the range.
2639 *
2640 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2641 * are discarded.
2642 *
2643 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
2644 * in (1) are discarded. I.e. pairs of openers and closers which are both
2645 * inside the range are retained as well as any unpaired marks.
2646 */
2647static void
2648md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2649{
2650 int i;
2651 int mark_index;
2652
2653 /* Cut all unresolved openers at the mark index. */
2654 for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
2655 MD_MARKCHAIN* chain = &ctx->mark_chains[i];
2656
2657 while(chain->tail >= opener_index) {
2658 int same = chain->tail == opener_index;
2659 chain->tail = ctx->marks[chain->tail].prev;
2660 if (same) break;
2661 }
2662
2663 if(chain->tail >= 0)
2664 ctx->marks[chain->tail].next = -1;
2665 else
2666 chain->head = -1;
2667 }
2668
2669 /* Go backwards so that unresolved openers are re-added into their
2670 * respective chains, in the right order. */
2671 mark_index = closer_index - 1;
2672 while(mark_index > opener_index) {
2673 MD_MARK* mark = &ctx->marks[mark_index];
2674 int mark_flags = mark->flags;
2675 int discard_flag = (how == MD_ROLLBACK_ALL);
2676
2677 if(mark->flags & MD_MARK_CLOSER) {
2678 int mark_opener_index = mark->prev;
2679
2680 /* Undo opener BEFORE the range. */
2681 if(mark_opener_index < opener_index) {
2682 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
2683 MD_MARKCHAIN* chain;
2684
2685 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2686 chain = md_mark_chain(ctx, opener_index);
2687 if(chain != NULL) {
2688 md_mark_chain_append(ctx, chain, mark_opener_index);
2689 discard_flag = 1;
2690 }
2691 }
2692 }
2693
2694 /* And reset our flags. */
2695 if(discard_flag) {
2696 /* Make zero-length closer a dummy mark as that's how it was born */
2697 if((mark->flags & MD_MARK_CLOSER) && mark->beg == mark->end)
2698 mark->ch = 'D';
2699
2700 mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2701 }
2702
2703 /* Jump as far as we can over unresolved or non-interesting marks. */
2704 switch(how) {
2705 case MD_ROLLBACK_CROSSING:
2706 if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
2707 /* If we are closer with opener INSIDE the range, there may
2708 * not be any other crosser inside the subrange. */
2709 mark_index = mark->prev;
2710 break;
2711 }
2712 MD_FALLTHROUGH();
2713 default:
2714 mark_index--;
2715 break;
2716 }
2717 }
2718}
2719
2720static void
2721md_build_mark_char_map(MD_CTX* ctx)
2722{
2723 memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2724
2725 ctx->mark_char_map['\\'] = 1;
2726 ctx->mark_char_map['*'] = 1;
2727 ctx->mark_char_map['_'] = 1;
2728 ctx->mark_char_map['`'] = 1;
2729 ctx->mark_char_map['&'] = 1;
2730 ctx->mark_char_map[';'] = 1;
2731 ctx->mark_char_map['<'] = 1;
2732 ctx->mark_char_map['>'] = 1;
2733 ctx->mark_char_map['['] = 1;
2734 ctx->mark_char_map['!'] = 1;
2735 ctx->mark_char_map[']'] = 1;
2736 ctx->mark_char_map['\0'] = 1;
2737
2738 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2739 ctx->mark_char_map['~'] = 1;
2740
2741 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2742 ctx->mark_char_map['$'] = 1;
2743
2744 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2745 ctx->mark_char_map['@'] = 1;
2746
2747 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2748 ctx->mark_char_map[':'] = 1;
2749
2750 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2751 ctx->mark_char_map['.'] = 1;
2752
2753 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2754 ctx->mark_char_map['|'] = 1;
2755
2756 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2757 int i;
2758
2759 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2760 if(ISWHITESPACE_(i))
2761 ctx->mark_char_map[i] = 1;
2762 }
2763 }
2764}
2765
2766/* We limit code span marks to lower than 32 backticks. This solves the
2767 * pathologic case of too many openers, each of different length: Their
2768 * resolving would be then O(n^2). */
2769#define CODESPAN_MARK_MAXLEN 32
2770
2771static int
2772md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2773 OFF* p_opener_beg, OFF* p_opener_end,
2774 OFF* p_closer_beg, OFF* p_closer_end,
2775 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2776 int* p_reached_paragraph_end)
2777{
2778 OFF opener_beg = beg;
2779 OFF opener_end;
2780 OFF closer_beg;
2781 OFF closer_end;
2782 SZ mark_len;
2783 OFF line_end;
2784 int has_space_after_opener = FALSE;
2785 int has_eol_after_opener = FALSE;
2786 int has_space_before_closer = FALSE;
2787 int has_eol_before_closer = FALSE;
2788 int has_only_space = TRUE;
2789 int line_index = 0;
2790
2791 line_end = lines[0].end;
2792 opener_end = opener_beg;
2793 while(opener_end < line_end && CH(opener_end) == _T('`'))
2794 opener_end++;
2795 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2796 has_eol_after_opener = (opener_end == line_end);
2797
2798 /* The caller needs to know end of the opening mark even if we fail. */
2799 *p_opener_end = opener_end;
2800
2801 mark_len = opener_end - opener_beg;
2802 if(mark_len > CODESPAN_MARK_MAXLEN)
2803 return FALSE;
2804
2805 /* Check whether we already know there is no closer of this length.
2806 * If so, re-scan does no sense. This fixes issue #59. */
2807 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2808 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2809 return FALSE;
2810
2811 closer_beg = opener_end;
2812 closer_end = opener_end;
2813
2814 /* Find closer mark. */
2815 while(TRUE) {
2816 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2817 if(CH(closer_beg) != _T(' '))
2818 has_only_space = FALSE;
2819 closer_beg++;
2820 }
2821 closer_end = closer_beg;
2822 while(closer_end < line_end && CH(closer_end) == _T('`'))
2823 closer_end++;
2824
2825 if(closer_end - closer_beg == mark_len) {
2826 /* Success. */
2827 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2828 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2829 break;
2830 }
2831
2832 if(closer_end - closer_beg > 0) {
2833 /* We have found a back-tick which is not part of the closer. */
2834 has_only_space = FALSE;
2835
2836 /* But if we eventually fail, remember it as a potential closer
2837 * of its own length for future attempts. This mitigates needs for
2838 * rescans. */
2839 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2840 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2841 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2842 }
2843 }
2844
2845 if(closer_end >= line_end) {
2846 line_index++;
2847 if(line_index >= n_lines) {
2848 /* Reached end of the paragraph and still nothing. */
2849 *p_reached_paragraph_end = TRUE;
2850 return FALSE;
2851 }
2852 /* Try on the next line. */
2853 line_end = lines[line_index].end;
2854 closer_beg = lines[line_index].beg;
2855 } else {
2856 closer_beg = closer_end;
2857 }
2858 }
2859
2860 /* If there is a space or a new line both after and before the opener
2861 * (and if the code span is not made of spaces only), consume one initial
2862 * and one trailing space as part of the marks. */
2863 if(!has_only_space &&
2864 (has_space_after_opener || has_eol_after_opener) &&
2865 (has_space_before_closer || has_eol_before_closer))
2866 {
2867 if(has_space_after_opener)
2868 opener_end++;
2869 else
2870 opener_end = lines[1].beg;
2871
2872 if(has_space_before_closer)
2873 closer_beg--;
2874 else {
2875 closer_beg = lines[line_index-1].end;
2876 /* We need to eat the preceding "\r\n" but not any line trailing
2877 * spaces. */
2878 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2879 closer_beg++;
2880 }
2881 }
2882
2883 *p_opener_beg = opener_beg;
2884 *p_opener_end = opener_end;
2885 *p_closer_beg = closer_beg;
2886 *p_closer_end = closer_end;
2887 return TRUE;
2888}
2889
2890static int
2891md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2892{
2893 OFF off = beg+1;
2894
2895 MD_ASSERT(CH(beg) == _T('<'));
2896
2897 /* Check for scheme. */
2898 if(off >= max_end || !ISASCII(off))
2899 return FALSE;
2900 off++;
2901 while(1) {
2902 if(off >= max_end)
2903 return FALSE;
2904 if(off - beg > 32)
2905 return FALSE;
2906 if(CH(off) == _T(':') && off - beg >= 3)
2907 break;
2908 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2909 return FALSE;
2910 off++;
2911 }
2912
2913 /* Check the path after the scheme. */
2914 while(off < max_end && CH(off) != _T('>')) {
2915 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2916 return FALSE;
2917 off++;
2918 }
2919
2920 if(off >= max_end)
2921 return FALSE;
2922
2923 MD_ASSERT(CH(off) == _T('>'));
2924 *p_end = off+1;
2925 return TRUE;
2926}
2927
2928static int
2929md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2930{
2931 OFF off = beg + 1;
2932 int label_len;
2933
2934 MD_ASSERT(CH(beg) == _T('<'));
2935
2936 /* The code should correspond to this regexp:
2937 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2938 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2939 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2940 */
2941
2942 /* Username (before '@'). */
2943 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2944 off++;
2945 if(off <= beg+1)
2946 return FALSE;
2947
2948 /* '@' */
2949 if(off >= max_end || CH(off) != _T('@'))
2950 return FALSE;
2951 off++;
2952
2953 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2954 * characters or '-', but '-' is not allowed as first or last char. */
2955 label_len = 0;
2956 while(off < max_end) {
2957 if(ISALNUM(off))
2958 label_len++;
2959 else if(CH(off) == _T('-') && label_len > 0)
2960 label_len++;
2961 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2962 label_len = 0;
2963 else
2964 break;
2965
2966 if(label_len > 63)
2967 return FALSE;
2968
2969 off++;
2970 }
2971
2972 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2973 return FALSE;
2974
2975 *p_end = off+1;
2976 return TRUE;
2977}
2978
2979static int
2980md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2981{
2982 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2983 *p_missing_mailto = FALSE;
2984 return TRUE;
2985 }
2986
2987 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2988 *p_missing_mailto = TRUE;
2989 return TRUE;
2990 }
2991
2992 return FALSE;
2993}
2994
2995static int
2996md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2997{
2998 const MD_LINE* line_term = lines + n_lines;
2999 const MD_LINE* line;
3000 int ret = 0;
3001 MD_MARK* mark;
3002 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
3003 int codespan_scanned_till_paragraph_end = FALSE;
3004
3005 for(line = lines; line < line_term; line++) {
3006 OFF off = line->beg;
3007 OFF line_end = line->end;
3008
3009 while(TRUE) {
3010 CHAR ch;
3011
3012#ifdef MD4C_USE_UTF16
3013 /* For UTF-16, mark_char_map[] covers only ASCII. */
3014 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
3015 (ctx->mark_char_map[(unsigned char) CH(off)]))
3016#else
3017 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
3018 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
3019#endif
3020
3021 /* Optimization: Use some loop unrolling. */
3022 while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
3023 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
3024 off += 4;
3025 while(off < line_end && !IS_MARK_CHAR(off+0))
3026 off++;
3027
3028 if(off >= line_end)
3029 break;
3030
3031 ch = CH(off);
3032
3033 /* A backslash escape.
3034 * It can go beyond line->end as it may involve escaped new
3035 * line to form a hard break. */
3036 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3037 /* Hard-break cannot be on the last line of the block. */
3038 if(!ISNEWLINE(off+1) || line+1 < line_term)
3039 PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3040 off += 2;
3041 continue;
3042 }
3043
3044 /* A potential (string) emphasis start/end. */
3045 if(ch == _T('*') || ch == _T('_')) {
3046 OFF tmp = off+1;
3047 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3048 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3049
3050 while(tmp < line_end && CH(tmp) == ch)
3051 tmp++;
3052
3053 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
3054 left_level = 0;
3055 else if(ISUNICODEPUNCTBEFORE(off))
3056 left_level = 1;
3057 else
3058 left_level = 2;
3059
3060 if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
3061 right_level = 0;
3062 else if(ISUNICODEPUNCT(tmp))
3063 right_level = 1;
3064 else
3065 right_level = 2;
3066
3067 /* Intra-word underscore doesn't have special meaning. */
3068 if(ch == _T('_') && left_level == 2 && right_level == 2) {
3069 left_level = 0;
3070 right_level = 0;
3071 }
3072
3073 if(left_level != 0 || right_level != 0) {
3074 unsigned flags = 0;
3075
3076 if(left_level > 0 && left_level >= right_level)
3077 flags |= MD_MARK_POTENTIAL_CLOSER;
3078 if(right_level > 0 && right_level >= left_level)
3079 flags |= MD_MARK_POTENTIAL_OPENER;
3080 if(left_level == 2 && right_level == 2)
3081 flags |= MD_MARK_EMPH_INTRAWORD;
3082
3083 /* For "the rule of three" we need to remember the original
3084 * size of the mark (modulo three), before we potentially
3085 * split the mark when being later resolved partially by some
3086 * shorter closer. */
3087 switch((tmp - off) % 3) {
3088 case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3089 case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3090 case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3091 }
3092
3093 PUSH_MARK(ch, off, tmp, flags);
3094
3095 /* During resolving, multiple asterisks may have to be
3096 * split into independent span start/ends. Consider e.g.
3097 * "**foo* bar*". Therefore we push also some empty dummy
3098 * marks to have enough space for that. */
3099 off++;
3100 while(off < tmp) {
3101 PUSH_MARK('D', off, off, 0);
3102 off++;
3103 }
3104 continue;
3105 }
3106
3107 off = tmp;
3108 continue;
3109 }
3110
3111 /* A potential code span start/end. */
3112 if(ch == _T('`')) {
3113 OFF opener_beg, opener_end;
3114 OFF closer_beg, closer_end;
3115 int is_code_span;
3116
3117 is_code_span = md_is_code_span(ctx, line, line_term - line, off,
3118 &opener_beg, &opener_end, &closer_beg, &closer_end,
3119 codespan_last_potential_closers,
3120 &codespan_scanned_till_paragraph_end);
3121 if(is_code_span) {
3122 PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3123 PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3124 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3125 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3126
3127 off = closer_end;
3128
3129 /* Advance the current line accordingly. */
3130 if(off > line_end) {
3131 line = md_lookup_line(off, line, line_term - line);
3132 if(NULL == line) exit(15);
3133 line_end = line->end;
3134 }
3135 continue;
3136 }
3137
3138 off = opener_end;
3139 continue;
3140 }
3141
3142 /* A potential entity start. */
3143 if(ch == _T('&')) {
3144 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3145 off++;
3146 continue;
3147 }
3148
3149 /* A potential entity end. */
3150 if(ch == _T(';')) {
3151 /* We surely cannot be entity unless the previous mark is '&'. */
3152 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
3153 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3154
3155 off++;
3156 continue;
3157 }
3158
3159 /* A potential autolink or raw HTML start/end. */
3160 if(ch == _T('<')) {
3161 int is_autolink;
3162 OFF autolink_end;
3163 int missing_mailto;
3164
3165 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3166 int is_html;
3167 OFF html_end;
3168
3169 /* Given the nature of the raw HTML, we have to recognize
3170 * it here. Doing so later in md_analyze_lt_gt() could
3171 * open can of worms of quadratic complexity. */
3172 is_html = md_is_html_any(ctx, line, line_term - line, off,
3173 lines[n_lines-1].end, &html_end);
3174 if(is_html) {
3175 PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3176 PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3177 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3178 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3179 off = html_end;
3180
3181 /* Advance the current line accordingly. */
3182 if(off > line_end) {
3183 line = md_lookup_line(off, line, line_term - line);
3184 line_end = line->end;
3185 }
3186 continue;
3187 }
3188 }
3189
3190 is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
3191 &autolink_end, &missing_mailto);
3192 if(is_autolink) {
3193 PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3194 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3195 PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3196 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3197 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3198 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3199 off = autolink_end;
3200 continue;
3201 }
3202
3203 off++;
3204 continue;
3205 }
3206
3207 /* A potential link or its part. */
3208 if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3209 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3210 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3211 off = tmp;
3212 /* Two dummies to make enough place for data we need if it is
3213 * a link. */
3214 PUSH_MARK('D', off, off, 0);
3215 PUSH_MARK('D', off, off, 0);
3216 continue;
3217 }
3218 if(ch == _T(']')) {
3219 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3220 off++;
3221 continue;
3222 }
3223
3224 /* A potential permissive e-mail autolink. */
3225 if(ch == _T('@')) {
3226 if(line->beg + 1 <= off && ISALNUM(off-1) &&
3227 off + 3 < line->end && ISALNUM(off+1))
3228 {
3229 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3230 /* Push a dummy as a reserve for a closer. */
3231 PUSH_MARK('D', off, off, 0);
3232 }
3233
3234 off++;
3235 continue;
3236 }
3237
3238 /* A potential permissive URL autolink. */
3239 if(ch == _T(':')) {
3240 static struct {
3241 const CHAR* scheme;
3242 SZ scheme_size;
3243 const CHAR* suffix;
3244 SZ suffix_size;
3245 } scheme_map[] = {
3246 /* In the order from the most frequently used, arguably. */
3247 { _T("http"), 4, _T("//"), 2 },
3248 { _T("https"), 5, _T("//"), 2 },
3249 { _T("ftp"), 3, _T("//"), 2 }
3250 };
3251 int scheme_index;
3252
3253 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3254 const CHAR* scheme = scheme_map[scheme_index].scheme;
3255 const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3256 const CHAR* suffix = scheme_map[scheme_index].suffix;
3257 const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3258
3259 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) &&
3260 (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
3261 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size))
3262 {
3263 PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3264 /* Push a dummy as a reserve for a closer. */
3265 PUSH_MARK('D', off, off, 0);
3266 off += 1 + suffix_size;
3267 break;
3268 }
3269 }
3270
3271 off++;
3272 continue;
3273 }
3274
3275 /* A potential permissive WWW autolink. */
3276 if(ch == _T('.')) {
3277 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) &&
3278 (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
3279 off + 1 < line_end)
3280 {
3281 PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3282 /* Push a dummy as a reserve for a closer. */
3283 PUSH_MARK('D', off, off, 0);
3284 off++;
3285 continue;
3286 }
3287
3288 off++;
3289 continue;
3290 }
3291
3292 /* A potential table cell boundary or wiki link label delimiter. */
3293 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3294 PUSH_MARK(ch, off, off+1, 0);
3295 off++;
3296 continue;
3297 }
3298
3299 /* A potential strikethrough start/end. */
3300 if(ch == _T('~')) {
3301 OFF tmp = off+1;
3302
3303 while(tmp < line_end && CH(tmp) == _T('~'))
3304 tmp++;
3305
3306 if(tmp - off < 3) {
3307 unsigned flags = 0;
3308
3309 if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
3310 flags |= MD_MARK_POTENTIAL_OPENER;
3311 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
3312 flags |= MD_MARK_POTENTIAL_CLOSER;
3313 if(flags != 0)
3314 PUSH_MARK(ch, off, tmp, flags);
3315 }
3316
3317 off = tmp;
3318 continue;
3319 }
3320
3321 /* A potential equation start/end */
3322 if(ch == _T('$')) {
3323 /* We can have at most two consecutive $ signs,
3324 * where two dollar signs signify a display equation. */
3325 OFF tmp = off+1;
3326
3327 while(tmp < line_end && CH(tmp) == _T('$'))
3328 tmp++;
3329
3330 if (tmp - off <= 2)
3331 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3332 off = tmp;
3333 continue;
3334 }
3335
3336 /* Turn non-trivial whitespace into single space. */
3337 if(ISWHITESPACE_(ch)) {
3338 OFF tmp = off+1;
3339
3340 while(tmp < line_end && ISWHITESPACE(tmp))
3341 tmp++;
3342
3343 if(tmp - off > 1 || ch != _T(' '))
3344 PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3345
3346 off = tmp;
3347 continue;
3348 }
3349
3350 /* NULL character. */
3351 if(ch == _T('\0')) {
3352 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3353 off++;
3354 continue;
3355 }
3356
3357 off++;
3358 }
3359 }
3360
3361 /* Add a dummy mark at the end of the mark vector to simplify
3362 * process_inlines(). */
3363 PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3364
3365abort:
3366 return ret;
3367}
3368
3369static void
3370md_analyze_bracket(MD_CTX* ctx, int mark_index)
3371{
3372 /* We cannot really resolve links here as for that we would need
3373 * more context. E.g. a following pair of brackets (reference link),
3374 * or enclosing pair of brackets (if the inner is the link, the outer
3375 * one cannot be.)
3376 *
3377 * Therefore we here only construct a list of '[' ']' pairs ordered by
3378 * position of the closer. This allows us to analyze what is or is not
3379 * link in the right order, from inside to outside in case of nested
3380 * brackets.
3381 *
3382 * The resolving itself is deferred to md_resolve_links().
3383 */
3384
3385 MD_MARK* mark = &ctx->marks[mark_index];
3386
3387 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3388 if(BRACKET_OPENERS.head != -1)
3389 ctx->marks[BRACKET_OPENERS.tail].flags |= MD_MARK_HASNESTEDBRACKETS;
3390
3391 md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3392 return;
3393 }
3394
3395 if(BRACKET_OPENERS.tail >= 0) {
3396 /* Pop the opener from the chain. */
3397 int opener_index = BRACKET_OPENERS.tail;
3398 MD_MARK* opener = &ctx->marks[opener_index];
3399 if(opener->prev >= 0)
3400 ctx->marks[opener->prev].next = -1;
3401 else
3402 BRACKET_OPENERS.head = -1;
3403 BRACKET_OPENERS.tail = opener->prev;
3404
3405 /* Interconnect the opener and closer. */
3406 opener->next = mark_index;
3407 mark->prev = opener_index;
3408
3409 /* Add the pair into chain of potential links for md_resolve_links().
3410 * Note we misuse opener->prev for this as opener->next points to its
3411 * closer. */
3412 if(ctx->unresolved_link_tail >= 0)
3413 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3414 else
3415 ctx->unresolved_link_head = opener_index;
3416 ctx->unresolved_link_tail = opener_index;
3417 opener->prev = -1;
3418 }
3419}
3420
3421/* Forward declaration. */
3422static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3423 int mark_beg, int mark_end);
3424
3425static int
3426md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3427{
3428 int opener_index = ctx->unresolved_link_head;
3429 OFF last_link_beg = 0;
3430 OFF last_link_end = 0;
3431 OFF last_img_beg = 0;
3432 OFF last_img_end = 0;
3433
3434 while(opener_index >= 0) {
3435 MD_MARK* opener = &ctx->marks[opener_index];
3436 int closer_index = opener->next;
3437 MD_MARK* closer = &ctx->marks[closer_index];
3438 int next_index = opener->prev;
3439 MD_MARK* next_opener;
3440 MD_MARK* next_closer;
3441 MD_LINK_ATTR attr;
3442 int is_link = FALSE;
3443
3444 if(next_index >= 0) {
3445 next_opener = &ctx->marks[next_index];
3446 next_closer = &ctx->marks[next_opener->next];
3447 } else {
3448 next_opener = NULL;
3449 next_closer = NULL;
3450 }
3451
3452 /* If nested ("[ [ ] ]"), we need to make sure that:
3453 * - The outer does not end inside of (...) belonging to the inner.
3454 * - The outer cannot be link if the inner is link (i.e. not image).
3455 *
3456 * (Note we here analyze from inner to outer as the marks are ordered
3457 * by closer->beg.)
3458 */
3459 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3460 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3461 (opener->beg < last_link_end && opener->ch == '['))
3462 {
3463 opener_index = next_index;
3464 continue;
3465 }
3466
3467 /* Recognize and resolve wiki links.
3468 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3469 */
3470 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3471 (opener->end - opener->beg == 1) && /* not image */
3472 next_opener != NULL && /* double '[' opener */
3473 next_opener->ch == '[' &&
3474 (next_opener->beg == opener->beg - 1) &&
3475 (next_opener->end - next_opener->beg == 1) &&
3476 next_closer != NULL && /* double ']' closer */
3477 next_closer->ch == ']' &&
3478 (next_closer->beg == closer->beg + 1) &&
3479 (next_closer->end - next_closer->beg == 1))
3480 {
3481 MD_MARK* delim = NULL;
3482 int delim_index;
3483 OFF dest_beg, dest_end;
3484
3485 is_link = TRUE;
3486
3487 /* We don'