Re: stripping HTML tags with Perl

To: tlug@example.com
Subject: Re: stripping HTML tags with Perl
From: Jake Morrison <jacob.morrison@example.com>
Date: Tue, 05 Dec 2000 11:47:34 +0800
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=us-ascii
Organization: Syntegra Asia
References: <20001204133053G.poulin@example.com> <3A2C33F7.FA683194@example.com> <20001204175106Q.poulin@example.com>
Reply-To: tlug@example.com
Resent-From: tlug@example.com
Resent-Message-ID: <GW1TZB.A.2iC.OiGL6@example.com>
Resent-Sender: tlug-request@example.com

This is answered in the FAQ that comes with Perl:

----------------------------------------------
How do I remove HTML from a string?

The most correct way (albeit not the fastest) is to use HTML::Parser from CPAN. Another
mostly correct way is to use HTML::FormatText which not only removes HTML but also attempts
to do a little simple formatting of the resulting plain text.

Many folks attempt a simple-minded regular expression approach, like s/<.*?>//g, but that
fails in many cases because the tags may continue over line breaks, they may contain quoted
angle-brackets, or HTML comments may be present. Plus folks forget to convert entities, like
&lt; for example.

Here's one ``simple-minded'' approach, that works for most files:

    #!/usr/bin/perl -p0777
    s/<(?:[^>'"]*|(['"]).*?\1)*>//gs

If you want a more complete solution, see the 3-stage striphtml program in
http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/striphtml.gz .

Here are some tricky cases that you should think about when picking a solution:

    <IMG SRC = "foo.gif" ALT = "A > B">

    <IMG SRC = "foo.gif"
         ALT = "A > B">

    <!-- <A comment> -->

    <script>if (a<b && a>c)</script>

    <# Just data #>

    <![INCLUDE CDATA [ >>>>>>>>>>>> ]]>

If HTML comments include other tags, those solutions would also break on text like this:

    <!-- This section commented out.
        <B>You can't see me!</B>
    -->

---------------------------------------------------------
I had to do this a while back, but without having perl
available on the target machine. So I used the Perl 
Compatible Regular Expressions C library 
(ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/).

Pretty handy. But still considerably more painful than 
the real thing. 

Just for fun, here is the resulting program:

#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pcre.h>

#define VECSIZE 60 
#define BUFLEN 10000

int debugMode = 0;

/*
 * Write a one-line description of a PCRE error code to stderr.
 *
 * pairs - the error code to describe; codes without a dedicated message
 *         are reported numerically.
 */
void
print_re_error(int pairs)
{
	const char *text = NULL;

	/* Look up the fixed message for the codes we know about. */
	if (pairs == PCRE_ERROR_NOMATCH) {
		text = "No match.\n";
	} else if (pairs == PCRE_ERROR_NULL) {
		text = "One or more NULL input params.\n";
	} else if (pairs == PCRE_ERROR_BADOPTION) {
		text = "Invalid option specified.\n";
	} else if (pairs == PCRE_ERROR_BADMAGIC) {
		text = "Bad magic.\n";
	} else if (pairs == PCRE_ERROR_UNKNOWN_NODE) {
		text = "Unknown node.\n";
	} else if (pairs == PCRE_ERROR_NOMEMORY) {
		text = "Out of memory.\n";
	}

	if (text != NULL) {
		fputs(text, stderr);
	} else {
		fprintf(stderr, "Unknown error %d.\n", pairs);
	}
}

/*
 * Compile a regular expression with pcre_compile().
 *
 * re_str  - pattern text
 * options - option bits handed to pcre_compile() unchanged
 *
 * Returns the compiled pattern, or NULL after printing the compile error
 * and its offset to stderr.
 */
pcre*
re_compile(const char* re_str, int options) 
{
	const char *err_text = NULL;
	int err_pos = 0;
	pcre *compiled = NULL;

#ifdef RE_DEBUG
	printf("re_compile> re_str: %s\n", re_str);
#endif

	compiled = pcre_compile(re_str, options, &err_text, &err_pos, NULL);
	if (compiled != NULL) {
#ifdef RE_DEBUG
		printf("re_compile> pcre_compile succeeded\n");
#endif
		return compiled;
	}

	fprintf(stderr, "pcre_compile: error at offset %d: %s\n", err_pos, err_text);
	return NULL;
}

/* Forward declaration so main() can call strip_html(), which is defined
   after it. */
int 
strip_html(char* dest, const char* src, size_t src_len, 
		   pcre *re, pcre_extra *extra, int debug);

/*
 * Filter stdin to stdout, removing whatever the tag pattern matches.
 *
 * Input is read with fgets() in chunks of at most BUFLEN - 1 characters and
 * each chunk is stripped on its own. A chunk that strip_html() rejects is
 * reported on stderr and not written to stdout.
 *
 * Exits with status 3 if the pattern does not compile, 0 otherwise.
 */
int 
main (int argc, char *argv[])
{
	const char* html_re_str =  "<(?:[^>'\"]*|(['\"]).*?\\1)*>";
	const char *study_error = NULL;
	pcre_extra *extra = NULL;
	pcre *html_re = NULL;
	char source[BUFLEN];
	char dest[BUFLEN];

	html_re = re_compile(html_re_str, 0);
	if (!html_re) {
		exit (3);
	}

	/* A study failure is only reported; matching proceeds with whatever
	   pcre_study() returned. */
	extra = pcre_study(html_re, 0, &study_error);
	if (study_error) {
		fprintf(stderr, "pcre_study: %s\n", study_error);
	}

	while (fgets(source, BUFLEN, stdin) != NULL) {
		int rc = strip_html(dest, source, strlen(source),
							html_re, extra, debugMode);

		if (rc == 0) {
			printf("%s", dest);
		} else {
			fprintf(stderr, "Problem parsing string %s\n", source);
		}
	}

	pcre_free(extra);
	pcre_free(html_re);

	exit (0);
}

/*
 * Strip HTML tags from src, copying what remains to dest.
 *
 * dest    - output buffer; always left NUL-terminated on return. Only bytes
 *           taken from src are written, so src_len + 1 bytes is sufficient.
 * src     - input text; NULL or empty input yields an empty dest
 * src_len - number of bytes of src to scan and copy
 * re      - compiled pattern matching one tag (must not be NULL)
 * extra   - study data passed through to pcre_exec()
 * debug   - when non-zero, pcre_exec() failures are described on stderr
 *
 * Returns 0 on success, 1 if pcre_exec() fails with any error other than
 * PCRE_ERROR_NOMATCH.
 */
int
strip_html(char* dest, const char* src, size_t src_len, 
		   pcre *re, pcre_extra *extra, int debug)
{
	int ovector[VECSIZE];  /* Match offsets filled in by pcre_exec() */
	int pairs = 0;         /* pcre_exec() return value */
	int start_offset = 0;  /* Point in src string to start searching */
	int flags = 0;         /* Regex matching flags */
	char* dest_cur = dest; /* Pointer to current location in dest buffer */

#ifdef DEBUG
	int i; /* Utility counter */
	char buf[BUFLEN]; /* buffer used for debugging */
#endif

	assert(re);
	assert(dest);

	/* Missing or empty input: hand back an empty string so the caller
	   never reads an uninitialised dest. */
	if (!src || src_len == 0 || !src[0]) {
#ifdef DEBUG
		fprintf(stderr, "strip_html> Null input string\n"); 
#endif
		dest[0] = '\0';
		return (0);
	}

#ifdef DEBUG
	fprintf(stderr, "strip_html> parsing string %s\n", src); 
#endif

	while (1) {
		pairs = pcre_exec(re, extra, src, (int)src_len, start_offset, 
						  flags, ovector, VECSIZE);  
		if (pairs < 0) {
			if (pairs == PCRE_ERROR_NOMATCH) {
				/* No HTML found in remainder of string, just copy the
				   rest of the scanned region and terminate the output */
				size_t rest = src_len - (size_t)start_offset;
#ifdef DEBUG
				fprintf(stderr, 
						"strip_html> No match for string '%s'.\n", 
						src + start_offset);
#endif
				memcpy(dest_cur, src + start_offset, rest);
				dest_cur[rest] = '\0';
				return (0);
			}

			if (debug) {
				fprintf(stderr, 
					"strip_html: Problem running regex for string %s: ", 
					src);
				print_re_error(pairs);
			}
			/* Leave dest as a valid string even on failure */
			*dest_cur = '\0';
			return (1);
		}
		else if (pairs == 0) {
			/* ovector was too small for all captured substrings; the
			   overall match offsets in ovector[0..1] are still usable */
#ifdef DEBUG
			fprintf(stderr, 
				"strip_html> More than %d matches for string %s\n", 
				VECSIZE / 3, src);
#endif
			pairs = VECSIZE / 3;
		}
		else {
#ifdef DEBUG
			fprintf(stderr, "strip_html> Matched %d pair(s)\n", pairs); 
#endif
		}

#ifdef DEBUG
		fprintf(stderr, "strip_html> ovector[0]: %d\n", ovector[0]);
		fprintf(stderr, "strip_html> ovector[1]: %d\n", ovector[1]);

		for (i = 0; i < pairs; i++) {
			int len;
			fprintf(stderr, "strip_html> pair: %d\n", i);
			len = pcre_copy_substring(src, ovector, pairs, i, 
									  buf, sizeof(buf));
			if (len < 0) {
				fprintf(stderr, 
						"strip_html> Problem getting substring %d: %d\n", i, len);
			}
			else {
				fprintf(stderr, 
						"strip_html> Matched string %d: %s\n", i, buf);
			}
		}
#endif

		/* A zero-length match at the current position would never advance
		   start_offset. Keep one byte of input and retry after it so the
		   loop terminates for any pattern. */
		if (ovector[1] <= start_offset) {
			if ((size_t)start_offset >= src_len) {
				*dest_cur = '\0';
				return (0);
			}
			*dest_cur++ = src[start_offset++];
			continue;
		}

		/* Copy part before the match */
		memcpy(dest_cur, src + start_offset, 
			   (size_t)(ovector[0] - start_offset)); 

		/* Update current location in destination buffer */
		dest_cur += ovector[0] - start_offset;

#ifdef DEBUG
		/* dest is not terminated yet, so bound the print explicitly */
		fprintf(stderr, "dest: %.*s\n", (int)(dest_cur - dest), dest);
#endif

		/* Resume searching just past the matched tag */
		start_offset = ovector[1];
	}
}

References:
- stripping HTML tags with Perl
  - From: "Drew C. Poulin" <poulin@example.com>
- Re: stripping HTML tags with Perl
  - From: Fredric Fredricson <fredric.fredriksson@example.com>
- Re: stripping HTML tags with Perl
  - From: "Drew C. Poulin" <poulin@example.com>

Prev by Date: Re: stripping HTML tags with Perl
Next by Date: Re: stripping HTML tags with Perl
Prev by thread: Re: stripping HTML tags with Perl
Next by thread: stripping HTML tags with Perl
Index(es):
- Date
- Thread

Home | Main Index | Thread Index