Source code for crawling.parsers

#!/usr/bin/python3

"""Data of webpage are geted by the python html.parser.
Here is two parser, the first one for all informations and
the sencond one only for encoding."""

from html.parser import HTMLParser
from html.entities import name2codepoint, html5

from swiftea_bot.data import LIST_TAG_WORDS, LIST_ALONE_TAG_WORDS

class ExtractData(HTMLParser):
    """Html parser to extract data.

    ``dict(attrs).get('content')``: converts attrs into a dict and returns the value.

    Data that can be extracted:

    - title
    - language
    - description
    - links, with nofollow and noindex
    - stylesheet
    - favicon
    - keywords: h1, h2, h3, strong, em
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = list()    # List of links
        self.keywords = ''     # All keywords in one string
        self.is_title = False  # True if the data is the title
        self.word1 = False     # True if the data is words
        self.word2 = False     # True if the data is words and the tag is used in and out of other word tags
        self.css = False       # True if there is a css link in the source code
        self.h1 = False        # True if parsing the first heading of the webpage
        self.first_title = ''  # The first title (h1) of the website
        self.description = self.language = self.title = self.favicon = ''

    def re_init(self):
        """Called when an html tag is met; resets all variables to their defaults."""
        self.links = list()
        self.first_title = self.keywords = self.description = ''
        self.language = self.title = self.favicon = ''
        self.css = self.h1 = False
        self.is_title = False
        self.word1 = False
        self.word2 = False

    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a starting tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes, e.g. [('name', 'language'), ('content', 'fr')]
        :type attrs: list
        """
        if tag == 'html':
            self.re_init()
            if len(dict(attrs).get('lang', '')) >= 2:
                self.language = dict(attrs).get('lang').lower().strip()[:2]
        elif tag == 'a':
            url = can_append(dict(attrs).get('href'), dict(attrs).get('rel', ''))
            if url:
                self.links.append(url)
        elif tag == 'link':
            rel = dict(attrs).get('rel', '')
            if rel == 'stylesheet':  # <link rel="stylesheet" type="text/css">
                self.css = True
            elif rel == 'icon' or rel == 'shortcut icon':  # <link rel="icon" href="favicon.ico">
                self.favicon = dict(attrs).get('href', '')
        elif tag == 'meta':
            language, description = meta(attrs)
            if language != str():
                self.language = language
            if description != str():
                self.description = description
        elif tag == 'title':
            self.is_title = True  # It's the title
        if tag in LIST_TAG_WORDS:
            self.word1 = True
        if tag in LIST_ALONE_TAG_WORDS:  # Tag used in and out of tags from LIST_TAG_WORDS
            self.word2 = True
        if tag == 'h1' and self.first_title == '':
            self.h1 = True  # It's an h1

    def handle_data(self, data):
        """Called when the parser meets data.

        :param data: text between a starting tag and an ending tag
        :type data: str
        """
        if self.is_title:
            self.title += data
        if self.word1 or self.word2:
            self.keywords += ' ' + data
        if self.h1:
            self.first_title = data

    def handle_endtag(self, tag):
        """Called when the parser meets an ending tag.

        :param tag: ending tag
        :type tag: str
        """
        if tag == 'title':
            self.is_title = False
        if tag == 'h1':
            self.h1 = False
        if tag in LIST_TAG_WORDS:
            self.word1 = False
        if tag in LIST_ALONE_TAG_WORDS:  # Tag used in and out of tags from LIST_TAG_WORDS
            self.word2 = False

    def handle_entityref(self, name):
        """Called when the parser meets a named entity reference (e.g. &amp;); appends the decoded character to the title."""
        try:
            letter = chr(name2codepoint[name])
        except KeyError:
            try:
                letter = html5[name + ';']
            except KeyError:
                return  # Unknown entity: nothing to append
        if self.is_title:
            self.title += letter

    def handle_charref(self, name):
        """Called when the parser meets a numeric character reference (e.g. &#233; or &#xE9;); appends the character to the title."""
        if name.startswith('x'):
            letter = chr(int(name[1:], 16))
        else:
            letter = chr(int(name))
        if self.is_title:
            self.title += letter

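# A minimal usage sketch of ExtractData, not part of the module: the snippet and the
# name ref_parser are assumed examples. It only uses tags that do not rely on the
# helper functions defined below, and shows how references end up in the title.
ref_parser = ExtractData()
ref_parser.feed("<html><head><title>Caf&#233; &amp; Bar</title></head></html>")
print(ref_parser.title)  # 'Café & Bar'
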
def meta(attrs):
    """Search within a meta tag.

    Can find:
    <meta name='description' content='my description'/>
    <meta name='language' content='en'/>
    <meta http-equiv='content-language' content='en'/>

    :param attrs: attributes of the meta tag
    :type attrs: list
    :return: language, description
    """
    description = str()
    language = str()
    name = dict(attrs).get('name', '').lower()
    content = dict(attrs).get('content')
    if content:
        if name == 'description':
            description = content
        elif name == 'language':
            language = content.lower().strip()[:2]
    httpequiv = dict(attrs).get('http-equiv')
    contentlanguage = dict(attrs).get('content')
    if httpequiv and contentlanguage:
        if httpequiv.lower() == 'content-language':
            language = contentlanguage.lower().strip()[:2]
    return language, description

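# A short sketch of meta() on hand-written attribute lists; the values are assumed
# examples, shaped like what html.parser passes to handle_starttag.
print(meta([('name', 'description'), ('content', 'My website')]))        # ('', 'My website')
print(meta([('http-equiv', 'Content-Language'), ('content', 'en-US')]))  # ('en', '')
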
def can_append(url, rel):
    """Check the rel attribute to know if the crawler can take the link.

    Appends '!nofollow!' to the url if the links of that url must not be followed.

    :param url: url to add
    :type url: str
    :param rel: rel attribute of the a tag
    :type rel: str
    :return: None if the url can't be added, otherwise the url
    """
    if url:
        if 'noindex' not in rel:
            if 'nofollow' in rel:
                url += '!nofollow!'
            return url
        else:
            return None
    else:
        return None

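# A short sketch of can_append() behaviour; the urls are assumed examples.
print(can_append('http://example.com/page', 'nofollow'))  # 'http://example.com/page!nofollow!'
print(can_append('http://example.com/page', ''))           # 'http://example.com/page'
print(can_append('http://example.com/page', 'noindex'))    # None
print(can_append(None, ''))                                # None
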
class ExtractEncoding(HTMLParser):
    """Html parser to extract the encoding from source code."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.encoding = str()

    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a starting tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes
        :type attrs: list
        """
        if tag == 'meta':
            # <meta charset="utf-8">
            charset = dict(attrs).get('charset')
            if charset is not None:
                self.encoding = charset
            # <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
            httpequiv = dict(attrs).get('http-equiv')
            content = dict(attrs).get('content')
            if httpequiv is not None and content is not None:
                if httpequiv.lower() == 'content-type':
                    charset = content.find('charset')
                    if charset != -1:
                        self.encoding = content[charset + 8:]
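
# An end-to-end sketch of both parsers on an assumed HTML snippet; feed() comes from
# HTMLParser, and the names page, data_parser and encoding_parser are illustrative.
# Keywords are not shown because they depend on LIST_TAG_WORDS / LIST_ALONE_TAG_WORDS.
page = ("<html lang='en'><head>"
        "<meta http-equiv='Content-Type' content='text/html; charset=UTF-8'>"
        "<title>Home</title></head>"
        "<body><h1>Welcome</h1><a href='/about' rel='nofollow'>About</a></body></html>")

data_parser = ExtractData()
data_parser.feed(page)
print(data_parser.title)        # 'Home'
print(data_parser.first_title)  # 'Welcome'
print(data_parser.language)     # 'en'
print(data_parser.links)        # ['/about!nofollow!']

encoding_parser = ExtractEncoding()
encoding_parser.feed(page)
print(encoding_parser.encoding)  # 'UTF-8'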