Mailing List Archive | Support open source code!
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: stripping HTML tags with Perl
- To: tlug@example.com
- Subject: Re: stripping HTML tags with Perl
- From: Jake Morrison <jacob.morrison@example.com>
- Date: Tue, 05 Dec 2000 11:47:34 +0800
- Content-Transfer-Encoding: 7bit
- Content-Type: text/plain; charset=us-ascii
- Organization: Syntegra Asia
- References: <20001204133053G.poulin@example.com> <3A2C33F7.FA683194@example.com> <20001204175106Q.poulin@example.com>
- Reply-To: tlug@example.com
- Resent-From: tlug@example.com
- Resent-Message-ID: <GW1TZB.A.2iC.OiGL6@example.com>
- Resent-Sender: tlug-request@example.com
This is answered in the FAQ that comes with Perl: ---------------------------------------------- How do I remove HTML from a string? The most correct way (albeit not the fastest) is to use HTML::Parser from CPAN. Another mostly correct way is to use HTML::FormatText which not only removes HTML but also attempts to do a little simple formatting of the resulting plain text. Many folks attempt a simple-minded regular expression approach, like s/<.*?>//g, but that fails in many cases because the tags may continue over line breaks, they may contain quoted angle-brackets, or HTML comments may be present. Plus folks forget to convert entities, like &lt; for example. Here's one ``simple-minded'' approach, that works for most files: #!/usr/bin/perl -p0777 s/<(?:[^>'"]*|(['"]).*?\1)*>//gs If you want a more complete solution, see the 3-stage striphtml program in http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/striphtml.gz . Here are some tricky cases that you should think about when picking a solution: <IMG SRC = "foo.gif" ALT = "A > B"> <IMG SRC = "foo.gif" ALT = "A > B"> <!-- <A comment> --> <script>if (a<b && a>c)</script> <# Just data #> <![INCLUDE CDATA [ >>>>>>>>>>>> ]]> If HTML comments include other tags, those solutions would also break on text like this: <!-- This section commented out. <B>You can't see me!</B> --> --------------------------------------------------------- I had to do this a while back, but without having Perl available on the target machine. So I used the Perl Compatible Regular Expressions C library (ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/). Pretty handy. But still considerably more painful than the real thing. 
Just for fun, here is the resulting program: #include <stdio.h> #include <malloc.h> #include <assert.h> #include <pcre.h> #define VECSIZE 60 #define BUFLEN 10000 int debugMode = 0; void print_re_error(int pairs) { switch (pairs) { case PCRE_ERROR_NOMATCH: fprintf(stderr, "No match.\n"); break; case PCRE_ERROR_NULL: fprintf(stderr, "One or more NULL input params.\n"); break; case PCRE_ERROR_BADOPTION: fprintf(stderr, "Invalid option specified.\n"); break; case PCRE_ERROR_BADMAGIC: fprintf(stderr, "Bad magic.\n"); break; case PCRE_ERROR_UNKNOWN_NODE: fprintf(stderr, "Unknown node.\n"); break; case PCRE_ERROR_NOMEMORY: fprintf(stderr, "Out of memory.\n"); break; default: fprintf(stderr, "Unknown error %d.\n", pairs); } } pcre* re_compile(const char* re_str, int options) { pcre* re = NULL; const char *errptr = NULL; int erroffset = 0; #ifdef RE_DEBUG printf("re_compile> re_str: %s\n", re_str); #endif re = pcre_compile(re_str, options, &errptr, &erroffset, NULL); if (re == NULL) { fprintf(stderr, "pcre_compile: error at offset %d: %s\n", erroffset, errptr); return ((pcre*)NULL); } #ifdef RE_DEBUG printf("re_compile> pcre_compile succeeded\n"); #endif return (re); } int strip_html(char* dest, const char* src, size_t src_len, pcre *re, pcre_extra *extra, int debug); int main (int argc, char *argv[]) { pcre_extra *extra = NULL; const char* study_error = NULL; pcre *html_re = NULL; const char* html_re_str = "<(?:[^>'\"]*|(['\"]).*?\\1)*>"; int rc; char source[BUFLEN]; char dest[BUFLEN]; html_re = re_compile(html_re_str, 0); if (html_re == NULL) { exit (3); } extra = pcre_study(html_re, 0, &study_error); if (study_error != NULL) { fprintf(stderr, "pcre_study: %s\n", study_error); } while ( (fgets(source, BUFLEN, stdin)) != NULL) { rc = strip_html(dest, source, strlen(source), html_re, extra, debugMode); if (rc != 0) { fprintf(stderr, "Problem parsing string %s\n", source); continue; } printf("%s", dest); } pcre_free(extra); pcre_free(html_re); exit (0); } /* Strip HTML tags 
from source, copying to dest buffer */ int strip_html(char* dest, const char* src, size_t src_len, pcre *re, pcre_extra *extra, int debug) { int ovector[VECSIZE]; /* List of matches */ int pairs = 0; /* Number of matches */ int start_offset = 0; /* Point in src string to start searching */ int flags = 0; /* Regex matching flags */ char* dest_cur = dest; /* Pointer to current location in dest buffer */ #ifdef DEBUG int i; /* Utility counter */ char buf[BUFLEN]; /* buffer used for debugging */ #endif assert(re); #ifdef DEBUG fprintf(stderr, "strip_html> parsing string %s\n", src); #endif if (!src && src[0]) { #ifdef DEBUG fprintf(stderr, "strip_html> Null input string\n"); #endif return (0); } while (1) { pairs = pcre_exec(re, extra, src, src_len, start_offset, flags, ovector, VECSIZE); if (pairs < 0) { if (pairs == PCRE_ERROR_NOMATCH) { #ifdef DEBUG fprintf(stderr, "strip_html> No match for string '%s'.\n", src + start_offset); #endif /* No HTML found in remainder of string, just copy input to output */ strcpy(dest_cur, src + start_offset); return (0); } else { if (debug) { fprintf(stderr, "strip_html: Problem running regex for string %s: ", src); print_re_error(pairs); } return (1); } } else if (pairs == 0) { /* This should not happen */ #ifdef DEBUG fprintf(stderr, "strip_html> More than %d matches for string %s\n", VECSIZE / 3, src); #endif pairs = VECSIZE / 3; } else { /* This should always be 1 for this regex */ #ifdef DEBUG fprintf(stderr, "strip_html> Matched %d pair(s)\n", pairs); #endif } #ifdef DEBUG fprintf(stderr, "strip_html> ovector[0]: %d\n", ovector[0]); fprintf(stderr, "strip_html> ovector[1]: %d\n", ovector[1]); for (i = 0; i < pairs; i++) { int len; fprintf(stderr, "strip_html> pair: %d\n", i); len = pcre_copy_substring(src, ovector, pairs, i, buf, sizeof(buf)); if (len < 0) { fprintf(stderr, "strip_html> Problem getting substring %d: %d\n", i, len); } else { fprintf(stderr, "strip_html> Matched string %d: %s\n", i, buf); } } #endif /* Copy part 
before the match */ memcpy(dest_cur, src + start_offset, ovector[0] - start_offset); /* Update current location in destination buffer */ dest_cur += ovector[0] - start_offset; #ifdef DEBUG fprintf(stderr, "dest: %s\n", dest); #endif /* Update offset to point after data */ start_offset = ovector[1]; } return (0); }
- References:
- stripping HTML tags with Perl
- From: "Drew C. Poulin" <poulin@example.com>
- Re: stripping HTML tags with Perl
- From: Fredric Fredricson <fredric.fredriksson@example.com>
- Re: stripping HTML tags with Perl
- From: "Drew C. Poulin" <poulin@example.com>
Home | Main Index | Thread Index
- Prev by Date: Re: stripping HTML tags with Perl
- Next by Date: Re: stripping HTML tags with Perl
- Prev by thread: Re: stripping HTML tags with Perl
- Next by thread: stripping HTML tags with Perl
- Index(es):
Home Page Mailing List Linux and Japan TLUG Members Links