I want to remove all HTML tags (`<...>`) and keep only the text content.

Asked 2 years ago, Updated 2 years ago, 43 views

from html.parser import HTMLParser
import urllib.request  # the snippet calls urllib.request but never imported it

# Download the page and decode the response bytes, so the markup prints as
# real text rather than the "b'...'" repr that str(bytes) produces.
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://www.google.com') as br:
    # Use the charset declared in the Content-Type header when there is one.
    charset = br.headers.get_content_charset() or 'utf-8'
    html = br.read().decode(charset, errors='replace')
print(html)

This code just prints the raw HTML source as-is.

For example, from `<a class=gb1 href="http://www.youtube.com/?gl=KR&tab=w1">YouTube</a>` I only want to extract the text "YouTube", with the tag and its attributes removed.

How do I remove all tags and elements?

html5 python

2022-09-22 22:14

1 Answers

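Stripping HTML tags varies slightly depending on the Python version. The first version below (`from HTMLParser import HTMLParser`) is for Python 2, the second (`from html.parser import HTMLParser`) is for Python 3, and the last snippet shows how to use it.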

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    """Parser subclass that gathers the text it is given and ignores markup."""

    def __init__(self):
        # Start with an empty list of text chunks, then put the parser
        # into its initial state via reset().
        self.fed = []
        self.reset()

    def handle_data(self, d):
        """Store the text chunk *d* for later retrieval."""
        self.fed.append(d)

    def get_data(self):
        """Return every stored chunk concatenated into one string."""
        collected = self.fed
        return ''.join(collected)

def strip_tags(html):
    """Feed *html* through an MLStripper and return the text it collected."""
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that keeps only text content and discards the markup.

    Text chunks reported to :meth:`handle_data` are accumulated in
    ``self.fed``; :meth:`get_data` returns them joined into one string.
    """

    def __init__(self):
        # Run the base initialiser so every attribute the parser relies on
        # exists (it also calls reset()). Calling reset() alone skips
        # whatever HTMLParser.__init__ sets up.
        super().__init__()
        # Not read by the parser on Python 3.5+ (the strict mode was
        # removed); kept so the instance attribute is still present.
        self.strict = False
        # Have the parser decode character references (e.g. "&amp;")
        # before the text reaches handle_data().
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return *html* with all tags removed, leaving only the text.

    Args:
        html: Markup to strip, as ``str``.

    Returns:
        The concatenated text content of *html*.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing text (for instance text ending in an
    # unterminated "&...") while it waits for more input; close() tells
    # the parser the input is complete so that text is delivered too.
    s.close()
    return s.get_data()

from html.parser import HTMLParser
import urllib.request

# Paste the MLStripper / strip_tags definitions for your Python version here

# Fetch the page, decode it to text, then print it with the markup removed.
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://www.google.com') as br:
    # read() returns bytes, but HTMLParser.feed() needs str, so decode
    # first, using the charset from the Content-Type header when present.
    charset = br.headers.get_content_charset() or 'utf-8'
    html = br.read().decode(charset, errors='replace')
print(strip_tags(html))


2022-09-22 22:14

If you have any answers or tips


© 2024 OneMinuteCode. All rights reserved.