/* sfeed.c (29123B) - raw */
1 #include <ctype.h> 2 #include <errno.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 #include <strings.h> 8 9 #include "util.h" 10 #include "xml.h" 11 12 #define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) 13 #define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) 14 15 /* these feed fields support multiple separated values */ 16 #define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory) 17 18 /* string and byte-length */ 19 #define STRP(s) s,sizeof(s)-1 20 21 enum FeedType { 22 FeedTypeNone = 0, 23 FeedTypeRSS = 1, 24 FeedTypeAtom = 2 25 }; 26 27 enum ContentType { 28 ContentTypeNone = 0, 29 ContentTypePlain = 1, 30 ContentTypeHTML = 2 31 }; 32 static const char *contenttypes[] = { "", "plain", "html" }; 33 34 /* String data / memory pool */ 35 typedef struct string { 36 char *data; /* data */ 37 size_t len; /* string length */ 38 size_t bufsiz; /* allocated size */ 39 } String; 40 41 /* NOTE: the order of these fields (content, date, author) indicate the 42 * priority to use them, from least important to high. 
*/ 43 enum TagId { 44 TagUnknown = 0, 45 /* RSS */ 46 RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */ 47 RSSTagTitle, 48 RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded, 49 RSSTagGuid, 50 RSSTagGuidPermalinkFalse, 51 RSSTagGuidPermalinkTrue, 52 /* must be defined after GUID, because it can be a link (isPermaLink) */ 53 RSSTagLink, 54 RSSTagEnclosure, 55 RSSTagAuthor, RSSTagDccreator, 56 RSSTagCategory, 57 /* Atom */ 58 /* creation date has higher priority */ 59 AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished, 60 AtomTagTitle, 61 AtomTagMediaDescription, AtomTagSummary, AtomTagContent, 62 AtomTagId, 63 AtomTagLink, 64 AtomTagLinkAlternate, 65 AtomTagLinkEnclosure, 66 AtomTagAuthor, AtomTagAuthorName, 67 AtomTagCategory, 68 TagLast 69 }; 70 71 typedef struct feedtag { 72 char *name; /* name of tag to match */ 73 size_t len; /* len of `name` */ 74 enum TagId id; /* unique ID */ 75 } FeedTag; 76 77 typedef struct field { 78 String str; 79 enum TagId tagid; /* tagid set previously, used for tag priority */ 80 } FeedField; 81 82 enum { 83 FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent, 84 FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory, 85 FeedFieldLast 86 }; 87 88 typedef struct feedcontext { 89 String *field; /* current FeedItem field String */ 90 FeedField fields[FeedFieldLast]; /* data for current item */ 91 FeedTag tag; /* unique current parsed tag */ 92 int iscontent; /* in content data */ 93 int iscontenttag; /* in content tag */ 94 enum ContentType contenttype; /* content-type for item */ 95 enum FeedType feedtype; 96 int attrcount; /* count item HTML element attributes */ 97 } FeedContext; 98 99 static long long datetounix(long long, int, int, int, int, int); 100 static FeedTag * gettag(enum FeedType, const char *, size_t); 101 static long gettzoffset(const char *); 102 static int isattr(const char *, size_t, const char *, size_t); 103 static int istag(const char *, 
size_t, const char *, size_t); 104 static int parsetime(const char *, long long *); 105 static void printfields(void); 106 static void string_append(String *, const char *, size_t); 107 static void string_buffer_realloc(String *, size_t); 108 static void string_clear(String *); 109 static void string_print_encoded(String *); 110 static void string_print_timestamp(String *); 111 static void string_print_trimmed(String *); 112 static void string_print_trimmed_multi(String *); 113 static void string_print_uri(String *); 114 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t, 115 const char *, size_t); 116 static void xmlattrentity(XMLParser *, const char *, size_t, const char *, 117 size_t, const char *, size_t); 118 static void xmlattrend(XMLParser *, const char *, size_t, const char *, 119 size_t); 120 static void xmlattrstart(XMLParser *, const char *, size_t, const char *, 121 size_t); 122 static void xmldata(XMLParser *, const char *, size_t); 123 static void xmldataentity(XMLParser *, const char *, size_t); 124 static void xmltagend(XMLParser *, const char *, size_t, int); 125 static void xmltagstart(XMLParser *, const char *, size_t); 126 static void xmltagstartparsed(XMLParser *, const char *, size_t, int); 127 128 /* map tag name to TagId type */ 129 /* RSS, must be alphabetical order */ 130 static FeedTag rsstags[] = { 131 { STRP("author"), RSSTagAuthor }, 132 { STRP("category"), RSSTagCategory }, 133 { STRP("content:encoded"), RSSTagContentEncoded }, 134 { STRP("dc:creator"), RSSTagDccreator }, 135 { STRP("dc:date"), RSSTagDcdate }, 136 { STRP("description"), RSSTagDescription }, 137 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */ 138 { STRP("enclosure"), RSSTagEnclosure }, 139 { STRP("guid"), RSSTagGuid }, 140 { STRP("link"), RSSTagLink }, 141 { STRP("media:description"), RSSTagMediaDescription }, 142 { STRP("pubdate"), RSSTagPubdate }, 143 { STRP("title"), RSSTagTitle } 144 }; 145 146 /* Atom, must be 
alphabetical order */ 147 static FeedTag atomtags[] = { 148 { STRP("author"), AtomTagAuthor }, 149 { STRP("category"), AtomTagCategory }, 150 { STRP("content"), AtomTagContent }, 151 { STRP("id"), AtomTagId }, 152 { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */ 153 /* Atom: <link href="" />, RSS has <link></link> */ 154 { STRP("link"), AtomTagLink }, 155 { STRP("media:description"), AtomTagMediaDescription }, 156 { STRP("modified"), AtomTagModified }, /* Atom 0.3 */ 157 { STRP("published"), AtomTagPublished }, 158 { STRP("summary"), AtomTagSummary }, 159 { STRP("title"), AtomTagTitle }, 160 { STRP("updated"), AtomTagUpdated } 161 }; 162 163 /* special case: nested <author><name> */ 164 static FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; 165 static FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName }; 166 167 /* reference to no / unknown tag */ 168 static FeedTag notag = { STRP(""), TagUnknown }; 169 170 /* map TagId type to RSS/Atom field, all tags must be defined */ 171 static int fieldmap[TagLast] = { 172 [TagUnknown] = -1, 173 /* RSS */ 174 [RSSTagDcdate] = FeedFieldTime, 175 [RSSTagPubdate] = FeedFieldTime, 176 [RSSTagTitle] = FeedFieldTitle, 177 [RSSTagMediaDescription] = FeedFieldContent, 178 [RSSTagDescription] = FeedFieldContent, 179 [RSSTagContentEncoded] = FeedFieldContent, 180 [RSSTagGuid] = -1, 181 [RSSTagGuidPermalinkFalse] = FeedFieldId, 182 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */ 183 [RSSTagLink] = FeedFieldLink, 184 [RSSTagEnclosure] = FeedFieldEnclosure, 185 [RSSTagAuthor] = FeedFieldAuthor, 186 [RSSTagDccreator] = FeedFieldAuthor, 187 [RSSTagCategory] = FeedFieldCategory, 188 /* Atom */ 189 [AtomTagModified] = FeedFieldTime, 190 [AtomTagUpdated] = FeedFieldTime, 191 [AtomTagIssued] = FeedFieldTime, 192 [AtomTagPublished] = FeedFieldTime, 193 [AtomTagTitle] = FeedFieldTitle, 194 [AtomTagMediaDescription] = FeedFieldContent, 195 [AtomTagSummary] = FeedFieldContent, 196 
[AtomTagContent] = FeedFieldContent, 197 [AtomTagId] = FeedFieldId, 198 [AtomTagLink] = -1, 199 [AtomTagLinkAlternate] = FeedFieldLink, 200 [AtomTagLinkEnclosure] = FeedFieldEnclosure, 201 [AtomTagAuthor] = -1, 202 [AtomTagAuthorName] = FeedFieldAuthor, 203 [AtomTagCategory] = FeedFieldCategory 204 }; 205 206 static const int FieldSeparator = '\t'; 207 /* separator for multiple values in a field, separator should be 1 byte */ 208 static const char *FieldMultiSeparator = "|"; 209 static struct uri baseuri; 210 static const char *baseurl; 211 212 static FeedContext ctx; 213 static XMLParser parser; /* XML parser state */ 214 static String attrispermalink, attrrel, attrtype, tmpstr; 215 216 static int 217 tagcmp(const void *v1, const void *v2) 218 { 219 return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name); 220 } 221 222 /* Unique tagid for parsed tag name. */ 223 static FeedTag * 224 gettag(enum FeedType feedtype, const char *name, size_t namelen) 225 { 226 FeedTag f, *r = NULL; 227 228 f.name = (char *)name; 229 230 switch (feedtype) { 231 case FeedTypeRSS: 232 r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]), 233 sizeof(rsstags[0]), tagcmp); 234 break; 235 case FeedTypeAtom: 236 r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]), 237 sizeof(atomtags[0]), tagcmp); 238 break; 239 default: 240 break; 241 } 242 243 return r; 244 } 245 246 static char * 247 ltrim(const char *s) 248 { 249 for (; isspace((unsigned char)*s); s++) 250 ; 251 return (char *)s; 252 } 253 254 static char * 255 rtrim(const char *s) 256 { 257 const char *e; 258 259 for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--) 260 ; 261 return (char *)e; 262 } 263 264 /* Clear string only; don't free, prevents unnecessary reallocation. 
*/ 265 static void 266 string_clear(String *s) 267 { 268 if (s->data) 269 s->data[0] = '\0'; 270 s->len = 0; 271 } 272 273 static void 274 string_buffer_realloc(String *s, size_t newlen) 275 { 276 size_t alloclen; 277 278 if (newlen > SIZE_MAX / 2) { 279 alloclen = SIZE_MAX; 280 } else { 281 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) 282 ; 283 } 284 if (!(s->data = realloc(s->data, alloclen))) 285 err(1, "realloc"); 286 s->bufsiz = alloclen; 287 } 288 289 /* Append data to String, s->data and data may not overlap. */ 290 static void 291 string_append(String *s, const char *data, size_t len) 292 { 293 if (!len) 294 return; 295 296 if (s->len >= SIZE_MAX - len) { 297 errno = EOVERFLOW; 298 err(1, "realloc"); 299 } 300 301 /* check if allocation is necessary, never shrink the buffer. */ 302 if (s->len + len >= s->bufsiz) 303 string_buffer_realloc(s, s->len + len + 1); 304 memcpy(s->data + s->len, data, len); 305 s->len += len; 306 s->data[s->len] = '\0'; 307 } 308 309 /* Print text, encode TABs, newlines and '\', remove other whitespace. 310 * Remove leading and trailing whitespace. 
*/ 311 static void 312 string_print_encoded(String *s) 313 { 314 const char *p, *e; 315 316 if (!s->data || !s->len) 317 return; 318 319 p = ltrim(s->data); 320 e = rtrim(p); 321 322 for (; *p && p != e; p++) { 323 switch (*p) { 324 case '\n': putchar('\\'); putchar('n'); break; 325 case '\\': putchar('\\'); putchar('\\'); break; 326 case '\t': putchar('\\'); putchar('t'); break; 327 default: 328 /* ignore control chars */ 329 if (!iscntrl((unsigned char)*p)) 330 putchar(*p); 331 break; 332 } 333 } 334 } 335 336 static void 337 printtrimmed(const char *s) 338 { 339 char *p, *e; 340 341 p = ltrim(s); 342 e = rtrim(p); 343 for (; *p && p != e; p++) { 344 if (isspace((unsigned char)*p)) 345 putchar(' '); /* any whitespace to space */ 346 else if (!iscntrl((unsigned char)*p)) 347 /* ignore other control chars */ 348 putchar(*p); 349 } 350 } 351 352 /* Print text, replace TABs, carriage return and other whitespace with ' '. 353 * Other control chars are removed. Remove leading and trailing whitespace. */ 354 static void 355 string_print_trimmed(String *s) 356 { 357 if (!s->data || !s->len) 358 return; 359 360 printtrimmed(s->data); 361 } 362 363 /* Print each field with trimmed whitespace, separated by '|'. */ 364 static void 365 string_print_trimmed_multi(String *s) 366 { 367 char *p, *e; 368 int c; 369 370 if (!s->data || !s->len) 371 return; 372 373 for (p = s->data; ; p = e + 1) { 374 if ((e = strstr(p, FieldMultiSeparator))) { 375 c = *e; 376 *e = '\0'; 377 printtrimmed(p); 378 *e = c; /* restore NUL byte to original character */ 379 fputs(FieldMultiSeparator, stdout); 380 } else { 381 printtrimmed(p); 382 break; 383 } 384 } 385 } 386 387 /* Print URL, if it's a relative URL then it uses the global `baseurl`. 
*/ 388 static void 389 printuri(char *s) 390 { 391 char link[4096], *p, *e; 392 struct uri newuri, olduri; 393 int c, r = -1; 394 395 p = ltrim(s); 396 e = rtrim(p); 397 c = *e; 398 *e = '\0'; 399 400 if (baseurl && !uri_hasscheme(p) && 401 uri_parse(p, &olduri) != -1 && !olduri.proto[0] && 402 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0]) 403 r = uri_format(link, sizeof(link), &newuri); 404 405 if (r >= 0 && (size_t)r < sizeof(link)) 406 printtrimmed(link); 407 else 408 printtrimmed(p); 409 410 *e = c; /* restore NUL byte to original character */ 411 } 412 413 /* Print URL, if it's a relative URL then it uses the global `baseurl`. */ 414 static void 415 string_print_uri(String *s) 416 { 417 if (!s->data || !s->len) 418 return; 419 420 printuri(s->data); 421 } 422 423 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */ 424 static void 425 string_print_timestamp(String *s) 426 { 427 long long t; 428 429 if (!s->data || !s->len) 430 return; 431 432 if (parsetime(s->data, &t) != -1) 433 printf("%lld", t); 434 } 435 436 /* Convert time fields. Returns a UNIX timestamp. 
/* Convert time fields. Returns a UNIX timestamp (seconds since
 * 1970-01-01 00:00:00, no timezone adjustment is applied).
 *   year:           years since 1900, may be negative
 *   mon:            month index 0-11
 *   day:            day of the month, starting at 1
 *   hour, min, sec: time of the day
 * The values are not validated: `mon` indexes a table of 12 entries, so the
 * caller must pass a value in the range 0-11. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds from the start of a non-leap year to the start of a month */
	static const int secs_through_month[] = {
		0, 31 * 86400, 59 * 86400, 90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
	long long t;

	/* fast path for 1902-2038 (year 2 to 138): in this range every 4th
	   year is a leap year. The unsigned compare also rejects year < 2. */
	if (year - 2ULL <= 136) {
		leaps = (year - 68) >> 2;
		if (!((year - 68) & 3)) {
			/* the leap day of this year itself is added per month below */
			leaps--;
			is_leap = 1;
		} else {
			is_leap = 0;
		}
		t = 31536000 * (year - 70) + 86400 * leaps;
	} else {
		/* general case: count relative to the year 2000 using the
		   400 / 100 / 4 year leap rules */
		cycles = (year - 100) / 400;
		rem = (year - 100) % 400;
		if (rem < 0) {
			cycles--;
			rem += 400;
		}
		if (!rem) {
			is_leap = 1;
		} else {
			if (rem >= 300)
				centuries = 3, rem -= 300;
			else if (rem >= 200)
				centuries = 2, rem -= 200;
			else if (rem >= 100)
				centuries = 1, rem -= 100;
			if (rem) {
				leaps = rem / 4U;
				rem %= 4U;
				is_leap = !rem;
			}
		}
		leaps += 97 * cycles + 24 * centuries - is_leap;
		/* 946684800 is 2000-01-01 00:00:00 UTC */
		t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
	}
	t += secs_through_month[mon];
	if (is_leap && mon >= 2)
		t += 86400; /* past February in a leap year */
	t += 86400LL * (day - 1);
	t += 3600LL * hour;
	t += 60LL * min;
	t += sec;

	return t;
}
/* Get timezone from string, return time offset in seconds from UTC.
 * Accepts a numeric offset ("+hhmm", "-hh:mm") or one of the 3-letter North
 * American zone names below (matched case-sensitive). Leading whitespace is
 * skipped. Anything else, including "GMT", "UT" and "Z", returns 0.
 * NOTE: only parses timezones in RFC-822, many other timezone names are
 *       ambiguous anyway.
 *       ANSI and military zones are defined wrong in RFC822 and are
 *       unsupported, see note on RFC2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		const char *name;
		int offset; /* offset from UTC in seconds */
	} tzones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	const char *p;
	long tzhour = 0, tzmin = 0;
	size_t i;

	for (; isspace((unsigned char)*s); s++)
		;
	switch (*s) {
	case '-': /* offset */
	case '+':
		/* at most 2 digits for the hours and 2 for the minutes */
		for (i = 0, p = s + 1; i < 2 && isdigit((unsigned char)*p); i++, p++)
			tzhour = (tzhour * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (i = 0; i < 2 && isdigit((unsigned char)*p); i++, p++)
			tzmin = (tzmin * 10) + (*p - '0');
		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
	default: /* timezone name */
		for (i = 0; isalpha((unsigned char)s[i]); i++)
			;
		if (i != 3)
			return 0;
		/* compare tz and adjust offset relative to UTC */
		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
			if (!memcmp(s, tzones[i].name, 3))
				return tzones[i].offset;
		}
	}
	return 0;
}
*/ 545 static int 546 parsetime(const char *s, long long *tp) 547 { 548 static struct { 549 char *name; 550 int len; 551 } mons[] = { 552 { STRP("January"), }, 553 { STRP("February"), }, 554 { STRP("March"), }, 555 { STRP("April"), }, 556 { STRP("May"), }, 557 { STRP("June"), }, 558 { STRP("July"), }, 559 { STRP("August"), }, 560 { STRP("September"), }, 561 { STRP("October"), }, 562 { STRP("November"), }, 563 { STRP("December"), }, 564 }; 565 int va[6] = { 0 }, i, j, v, vi; 566 size_t m; 567 568 for (; isspace((unsigned char)*s); s++) 569 ; 570 if (!isdigit((unsigned char)*s) && !isalpha((unsigned char)*s)) 571 return -1; 572 573 if (isdigit((unsigned char)s[0]) && 574 isdigit((unsigned char)s[1]) && 575 isdigit((unsigned char)s[2]) && 576 isdigit((unsigned char)s[3])) { 577 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */ 578 vi = 0; 579 } else { 580 /* format: "[%a, ]%d %b %Y %H:%M:%S" */ 581 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */ 582 for (; isalpha((unsigned char)*s); s++) 583 ; 584 for (; isspace((unsigned char)*s); s++) 585 ; 586 if (*s == ',') 587 s++; 588 for (; isspace((unsigned char)*s); s++) 589 ; 590 for (v = 0, i = 0; i < 2 && isdigit((unsigned char)*s); s++, i++) 591 v = (v * 10) + (*s - '0'); 592 va[2] = v; /* day */ 593 for (; isspace((unsigned char)*s); s++) 594 ; 595 /* end of word month */ 596 for (j = 0; isalpha((unsigned char)s[j]); j++) 597 ; 598 /* check month name */ 599 if (j < 3 || j > 9) 600 return -1; /* month cannot match */ 601 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { 602 /* abbreviation (3 length) or long name */ 603 if ((j == 3 || j == mons[m].len) && 604 !strncasecmp(mons[m].name, s, j)) { 605 va[1] = m + 1; 606 s += j; 607 break; 608 } 609 } 610 if (m >= 12) 611 return -1; /* no month found */ 612 for (; isspace((unsigned char)*s); s++) 613 ; 614 for (v = 0, i = 0; i < 4 && isdigit((unsigned char)*s); s++, i++) 615 v = (v * 10) + (*s - '0'); 616 /* obsolete short year: 
RFC2822 4.3 */ 617 if (i <= 3) 618 v += (v >= 0 && v <= 49) ? 2000 : 1900; 619 va[0] = v; /* year */ 620 for (; isspace((unsigned char)*s); s++) 621 ; 622 /* parse only regular time part, see below */ 623 vi = 3; 624 } 625 626 /* parse time parts (and possibly remaining date parts) */ 627 for (; *s && vi < 6; vi++) { 628 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && 629 isdigit((unsigned char)*s); s++, i++) { 630 v = (v * 10) + (*s - '0'); 631 } 632 va[vi] = v; 633 634 if ((vi < 2 && *s == '-') || 635 (vi == 2 && (*s == 'T' || isspace((unsigned char)*s))) || 636 (vi > 2 && *s == ':')) 637 s++; 638 } 639 640 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */ 641 if (*s == '.') { 642 for (s++; isdigit((unsigned char)*s); s++) 643 ; 644 } 645 646 /* invalid range */ 647 if (va[0] < 0 || va[0] > 9999 || 648 va[1] < 1 || va[1] > 12 || 649 va[2] < 1 || va[2] > 31 || 650 va[3] < 0 || va[3] > 23 || 651 va[4] < 0 || va[4] > 59 || 652 va[5] < 0 || va[5] > 60) /* allow leap second */ 653 return -1; 654 655 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - 656 gettzoffset(s); 657 658 return 0; 659 } 660 661 static void 662 printfields(void) 663 { 664 string_print_timestamp(&ctx.fields[FeedFieldTime].str); 665 putchar(FieldSeparator); 666 string_print_trimmed(&ctx.fields[FeedFieldTitle].str); 667 putchar(FieldSeparator); 668 string_print_uri(&ctx.fields[FeedFieldLink].str); 669 putchar(FieldSeparator); 670 string_print_encoded(&ctx.fields[FeedFieldContent].str); 671 putchar(FieldSeparator); 672 fputs(contenttypes[ctx.contenttype], stdout); 673 putchar(FieldSeparator); 674 string_print_trimmed(&ctx.fields[FeedFieldId].str); 675 putchar(FieldSeparator); 676 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str); 677 putchar(FieldSeparator); 678 string_print_uri(&ctx.fields[FeedFieldEnclosure].str); 679 putchar(FieldSeparator); 680 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str); 681 putchar('\n'); 682 } 683 684 static int 685 
istag(const char *name, size_t len, const char *name2, size_t len2) 686 { 687 return (len == len2 && !strcasecmp(name, name2)); 688 } 689 690 static int 691 isattr(const char *name, size_t len, const char *name2, size_t len2) 692 { 693 return (len == len2 && !strcasecmp(name, name2)); 694 } 695 696 static void 697 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 698 const char *v, size_t vl) 699 { 700 /* handles transforming inline XML to data */ 701 if (ISINCONTENT(ctx)) { 702 if (ctx.contenttype == ContentTypeHTML) 703 xmldata(p, v, vl); 704 return; 705 } 706 707 if (!ctx.tag.id) 708 return; 709 710 /* content-type may be: Atom: text, xhtml, html or mime-type. 711 MRSS (media:description): plain, html. */ 712 if (ISCONTENTTAG(ctx)) { 713 if (isattr(n, nl, STRP("type"))) 714 string_append(&attrtype, v, vl); 715 return; 716 } 717 718 if (ctx.feedtype == FeedTypeRSS) { 719 if (ctx.tag.id == RSSTagEnclosure && 720 isattr(n, nl, STRP("url"))) { 721 string_append(&tmpstr, v, vl); 722 } else if (ctx.tag.id == RSSTagGuid && 723 isattr(n, nl, STRP("ispermalink"))) { 724 string_append(&attrispermalink, v, vl); 725 } 726 } else if (ctx.feedtype == FeedTypeAtom) { 727 if (ctx.tag.id == AtomTagLink) { 728 if (isattr(n, nl, STRP("rel"))) { 729 string_append(&attrrel, v, vl); 730 } else if (isattr(n, nl, STRP("href"))) { 731 string_append(&tmpstr, v, vl); 732 } 733 } else if (ctx.tag.id == AtomTagCategory && 734 isattr(n, nl, STRP("term"))) { 735 string_append(&tmpstr, v, vl); 736 } 737 } 738 } 739 740 static void 741 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 742 const char *data, size_t datalen) 743 { 744 char buf[16]; 745 int len; 746 747 /* handles transforming inline XML to data */ 748 if (ISINCONTENT(ctx)) { 749 if (ctx.contenttype == ContentTypeHTML) 750 xmldata(p, data, datalen); 751 return; 752 } 753 754 if (!ctx.tag.id) 755 return; 756 757 /* try to translate entity, else just pass as data to 758 * xmldata 
handler. */ 759 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 760 xmlattr(p, t, tl, n, nl, buf, (size_t)len); 761 else 762 xmlattr(p, t, tl, n, nl, data, datalen); 763 } 764 765 static void 766 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 767 { 768 if (ISINCONTENT(ctx)) { 769 if (ctx.contenttype == ContentTypeHTML) { 770 /* handles transforming inline XML to data */ 771 xmldata(p, "\"", 1); 772 ctx.attrcount = 0; 773 } 774 return; 775 } 776 } 777 778 static void 779 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 780 { 781 if (ISINCONTENT(ctx)) { 782 if (ctx.contenttype == ContentTypeHTML) { 783 /* handles transforming inline XML to data */ 784 if (!ctx.attrcount) 785 xmldata(p, " ", 1); 786 ctx.attrcount++; 787 xmldata(p, n, nl); 788 xmldata(p, "=\"", 2); 789 } 790 return; 791 } 792 793 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink"))) 794 string_clear(&attrispermalink); 795 else if (attrrel.len && isattr(n, nl, STRP("rel"))) 796 string_clear(&attrrel); 797 else if (attrtype.len && isattr(n, nl, STRP("type"))) 798 string_clear(&attrtype); 799 else if (tmpstr.len && 800 (isattr(n, nl, STRP("href")) || 801 isattr(n, nl, STRP("term")) || 802 isattr(n, nl, STRP("url")))) 803 string_clear(&tmpstr); /* use the last value for multiple attribute values */ 804 } 805 806 static void 807 xmldata(XMLParser *p, const char *s, size_t len) 808 { 809 if (!ctx.field) 810 return; 811 812 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 813 string_append(&tmpstr, s, len); 814 else 815 string_append(ctx.field, s, len); 816 } 817 818 static void 819 xmldataentity(XMLParser *p, const char *data, size_t datalen) 820 { 821 char buf[16]; 822 int len; 823 824 if (!ctx.field) 825 return; 826 827 /* try to translate entity, else just pass as data to 828 * xmldata handler. 
*/ 829 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 830 xmldata(p, buf, (size_t)len); 831 else 832 xmldata(p, data, datalen); 833 } 834 835 static void 836 xmltagstart(XMLParser *p, const char *t, size_t tl) 837 { 838 FeedTag *f; 839 840 if (ISINCONTENT(ctx)) { 841 if (ctx.contenttype == ContentTypeHTML) { 842 ctx.attrcount = 0; 843 xmldata(p, "<", 1); 844 xmldata(p, t, tl); 845 } 846 return; 847 } 848 849 /* start of RSS or Atom item / entry */ 850 if (ctx.feedtype == FeedTypeNone) { 851 if (istag(t, tl, STRP("entry"))) 852 ctx.feedtype = FeedTypeAtom; 853 else if (istag(t, tl, STRP("item"))) 854 ctx.feedtype = FeedTypeRSS; 855 return; 856 } 857 858 /* field tagid already set or nested tags. */ 859 if (ctx.tag.id) { 860 /* nested <author><name> for Atom */ 861 if (ctx.tag.id == AtomTagAuthor && 862 istag(t, tl, STRP("name"))) { 863 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag)); 864 } else { 865 return; /* other nested tags are not allowed: return */ 866 } 867 } 868 869 /* in item */ 870 if (ctx.tag.id == TagUnknown) { 871 if (!(f = gettag(ctx.feedtype, t, tl))) 872 f = ¬ag; 873 memcpy(&(ctx.tag), f, sizeof(ctx.tag)); 874 } 875 876 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); 877 string_clear(&attrispermalink); 878 string_clear(&attrrel); 879 string_clear(&attrtype); 880 } 881 882 static void 883 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) 884 { 885 enum TagId tagid; 886 887 if (ISINCONTENT(ctx)) { 888 if (ctx.contenttype == ContentTypeHTML) { 889 if (isshort) 890 xmldata(p, "/>", 2); 891 else 892 xmldata(p, ">", 1); 893 } 894 return; 895 } 896 897 /* set tag type based on it's attribute value */ 898 if (ctx.tag.id == RSSTagGuid) { 899 /* if empty the default is "true" */ 900 if (!attrispermalink.len || 901 isattr(attrispermalink.data, attrispermalink.len, STRP("true"))) 902 ctx.tag.id = RSSTagGuidPermalinkTrue; 903 else 904 ctx.tag.id = RSSTagGuidPermalinkFalse; 905 } else if (ctx.tag.id == 
AtomTagLink) { 906 /* empty or "alternate": other types could be 907 "enclosure", "related", "self" or "via" */ 908 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate"))) 909 ctx.tag.id = AtomTagLinkAlternate; 910 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure"))) 911 ctx.tag.id = AtomTagLinkEnclosure; 912 else 913 ctx.tag.id = AtomTagLink; /* unknown */ 914 } 915 916 tagid = ctx.tag.id; 917 918 /* map tag type to field: unknown or lesser priority is ignored, 919 when tags of the same type are repeated only the first is used. */ 920 if (fieldmap[tagid] == -1 || 921 (!ISFEEDFIELDMULTI(fieldmap[tagid]) && 922 tagid <= ctx.fields[fieldmap[tagid]].tagid)) { 923 return; 924 } 925 926 if (ctx.iscontenttag) { 927 ctx.iscontent = 1; 928 ctx.iscontenttag = 0; 929 930 /* detect content-type based on type attribute */ 931 if (attrtype.len) { 932 if (isattr(attrtype.data, attrtype.len, STRP("html")) || 933 isattr(attrtype.data, attrtype.len, STRP("xhtml")) || 934 isattr(attrtype.data, attrtype.len, STRP("text/html")) || 935 isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) || 936 isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml"))) 937 ctx.contenttype = ContentTypeHTML; 938 else /* unknown: handle as base64 text data */ 939 ctx.contenttype = ContentTypePlain; 940 } else { 941 /* default content-type */ 942 if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription) 943 ctx.contenttype = ContentTypeHTML; 944 else 945 ctx.contenttype = ContentTypePlain; 946 } 947 } 948 949 ctx.field = &(ctx.fields[fieldmap[tagid]].str); 950 ctx.fields[fieldmap[tagid]].tagid = tagid; 951 952 /* clear field if it is overwritten (with a priority order) for the new 953 value, if the field can have multiple values then do not clear it. 
*/ 954 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 955 string_clear(ctx.field); 956 } 957 958 static void 959 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) 960 { 961 size_t i; 962 963 if (ctx.feedtype == FeedTypeNone) 964 return; 965 966 if (ISINCONTENT(ctx)) { 967 /* not close content field */ 968 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) { 969 if (!isshort && ctx.contenttype == ContentTypeHTML) { 970 xmldata(p, "</", 2); 971 xmldata(p, t, tl); 972 xmldata(p, ">", 1); 973 } 974 return; 975 } 976 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) { 977 /* matched tag end: close it */ 978 /* copy also to the link field if the attribute isPermaLink="true" 979 and it is not set by a tag with higher prio. */ 980 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field && 981 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) { 982 string_clear(&ctx.fields[FeedFieldLink].str); 983 string_append(&ctx.fields[FeedFieldLink].str, 984 ctx.field->data, ctx.field->len); 985 ctx.fields[FeedFieldLink].tagid = ctx.tag.id; 986 } 987 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && 988 istag(t, tl, STRP("entry"))) || /* Atom */ 989 (ctx.feedtype == FeedTypeRSS && 990 istag(t, tl, STRP("item"))))) /* RSS */ 991 { 992 /* end of RSS or Atom entry / item */ 993 printfields(); 994 995 /* clear strings */ 996 for (i = 0; i < FeedFieldLast; i++) { 997 string_clear(&ctx.fields[i].str); 998 ctx.fields[i].tagid = TagUnknown; 999 } 1000 ctx.contenttype = ContentTypeNone; 1001 /* allow parsing of Atom and RSS concatenated in one XML stream. */ 1002 ctx.feedtype = FeedTypeNone; 1003 } else { 1004 return; /* not end of field */ 1005 } 1006 1007 /* temporary string: for fields that cannot be processed 1008 directly and need more context, for example by it's tag 1009 attributes, like the Atom link rel="alternate|enclosure". 
*/ 1010 if (tmpstr.len && ctx.field) { 1011 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) { 1012 if (ctx.field->len) 1013 string_append(ctx.field, FieldMultiSeparator, 1); 1014 string_append(ctx.field, tmpstr.data, tmpstr.len); 1015 } else { 1016 string_clear(ctx.field); 1017 string_append(ctx.field, tmpstr.data, tmpstr.len); 1018 } 1019 } 1020 1021 /* close field */ 1022 string_clear(&tmpstr); /* reuse and clear temporary string */ 1023 1024 if (ctx.tag.id == AtomTagAuthorName) 1025 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */ 1026 else 1027 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1028 1029 ctx.iscontent = 0; 1030 ctx.field = NULL; 1031 } 1032 1033 int 1034 main(int argc, char *argv[]) 1035 { 1036 if (pledge("stdio", NULL) == -1) 1037 err(1, "pledge"); 1038 1039 if (argc > 1) { 1040 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0]) 1041 baseurl = argv[1]; 1042 else 1043 errx(1, "baseurl incorrect or too long"); 1044 } 1045 1046 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1047 1048 parser.xmlattr = xmlattr; 1049 parser.xmlattrentity = xmlattrentity; 1050 parser.xmlattrend = xmlattrend; 1051 parser.xmlattrstart = xmlattrstart; 1052 parser.xmlcdata = xmldata; 1053 parser.xmldata = xmldata; 1054 parser.xmldataentity = xmldataentity; 1055 parser.xmltagend = xmltagend; 1056 parser.xmltagstart = xmltagstart; 1057 parser.xmltagstartparsed = xmltagstartparsed; 1058 1059 /* NOTE: getnext is defined in xml.h for inline optimization */ 1060 xml_parse(&parser); 1061 1062 return 0; 1063 }