Source code for crawling.parsers
#!/usr/bin/python3
"""Data of webpage are geted by the python html.parser.
Here is two parser, the first one for all informations and
the sencond one only for encoding."""
from html.parser import HTMLParser
from html.entities import name2codepoint, html5
from swiftea_bot.data import LIST_TAG_WORDS, LIST_ALONE_TAG_WORDS
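# LIST_TAG_WORDS and LIST_ALONE_TAG_WORDS are defined in swiftea_bot.data; judging
# from the class docstring below they are assumed to hold the keyword-bearing tags
# (h1, h2, h3, strong, em), LIST_ALONE_TAG_WORDS covering tags that may appear both
# inside and outside the tags of LIST_TAG_WORDS.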
class ExtractData(HTMLParser):
    """Html parser to extract data from a webpage.

    Boolean flags (is_title, word1, word2, h1) record which kind of text the next
    data chunk belongs to; dict(attrs).get('content') converts attrs into a dict
    and returns the value of the 'content' attribute.

    Data that can be extracted:

    - title
    - language
    - description
    - links, honouring nofollow and noindex
    - stylesheet
    - favicon
    - keywords: h1, h2, h3, strong, em
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = list()  # List of links
        self.keywords = ''  # All keywords in a single string
        self.is_title = False  # True if the current data is the title
        self.word1 = False  # True if the current data counts as keywords
        self.word2 = False  # True if the current data counts as keywords and the tag is used both inside and outside the tags from LIST_TAG_WORDS
        self.css = False  # True if there is a css link in the source code
        self.h1 = False  # True while parsing the first h1 of the webpage
        self.first_title = ''  # The first title (h1) of the webpage
        self.description = self.language = self.title = self.favicon = ''
    def re_init(self):
        """Called when an html tag is met; reset all variables to their defaults."""
        self.links = list()
        self.first_title = self.keywords = self.description = ''
        self.language = self.title = self.favicon = ''
        self.css = self.h1 = False
        self.is_title = False
        self.word1 = False
        self.word2 = False
    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a start tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes, e.g. [('name', 'language'), ('content', 'fr')]
        :type attrs: list
        """
        if tag == 'html':
            self.re_init()
            lang = dict(attrs).get('lang') or ''  # guard: a bare `lang` attribute yields None
            if len(lang) >= 2:
                self.language = lang.lower().strip()[:2]
elif tag == 'a':
url = can_append(dict(attrs).get('href'), dict(attrs).get('rel', ''))
if url:
self.links.append(url)
elif tag == 'link':
rel = dict(attrs).get('rel', '')
if rel == 'stylesheet':
# LINK REL="STYLESHEET" TYPE="text/css"
self.css = True
elif rel == 'icon' or rel == 'shortcut icon':
# LINK REL="ICON" HREF="FAVICON.ICO"
self.favicon = dict(attrs).get('href', '')
        elif tag == 'meta':
            language, description = meta(attrs)
            if language:
                self.language = language
            if description:
                self.description = description
        elif tag == 'title':
            self.is_title = True  # The following data is the title
        if tag in LIST_TAG_WORDS:
            self.word1 = True
        if tag in LIST_ALONE_TAG_WORDS:  # tag used both inside and outside the tags from LIST_TAG_WORDS
            self.word2 = True
        if tag == 'h1' and self.first_title == '':
            self.h1 = True  # The following data is the first h1
    def handle_data(self, data):
        """Called when the parser meets data.

        :param data: text found between tags
        :type data: str
        """
        if self.is_title:
            self.title += data
        if self.word1 or self.word2:
            self.keywords += ' ' + data
        if self.h1:
            self.first_title = data
    def handle_endtag(self, tag):
        """Called when the parser meets an end tag.

        :param tag: ending tag
        :type tag: str
        """
        if tag == 'title':
            self.is_title = False
        if tag == 'h1':
            self.h1 = False
        if tag in LIST_TAG_WORDS:
            self.word1 = False
        if tag in LIST_ALONE_TAG_WORDS:  # tag used both inside and outside the tags from LIST_TAG_WORDS
            self.word2 = False
    def handle_entityref(self, name):
        """Called when the parser meets a named character reference (e.g. '&amp;')."""
        try:
            letter = chr(name2codepoint[name])
        except KeyError:
            try:
                letter = html5[name + ';']
            except KeyError:
                return  # Unknown entity: ignore it
        if self.is_title:
            self.title += letter
    def handle_charref(self, name):
        """Called when the parser meets a numeric character reference (e.g. '&#233;')."""
        if name.lower().startswith('x'):
            letter = chr(int(name[1:], 16))  # hexadecimal reference, e.g. &#xE9;
        else:
            letter = chr(int(name))  # decimal reference, e.g. &#233;
        if self.is_title:
            self.title += letter
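    # Note: with html.parser's default convert_charrefs=True these two handlers are
    # not normally invoked; if the parser were created with convert_charrefs=False,
    # "&amp;" would reach handle_entityref('amp') and "&#233;" would reach
    # handle_charref('233'), each appending the decoded character to self.title
    # while a <title> is being parsed.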
def meta(attrs):
    """Handle searches in meta tags.

    Can find:

    - <meta name='description' content='my description'/>
    - <meta name='language' content='en'/>
    - <meta http-equiv='content-language' content='en'/>

    :param attrs: attributes of the meta tag
    :type attrs: list
    :return: language, description
    """
description = str()
language = str()
name = dict(attrs).get('name', '').lower()
content = dict(attrs).get('content')
if content:
if name == 'description':
description = content
elif name == 'language':
language = content.lower().strip()[:2]
httpequiv = dict(attrs).get('http-equiv')
contentlanguage = dict(attrs).get('content')
if httpequiv and contentlanguage:
if httpequiv.lower() == 'content-language':
language = contentlanguage.lower().strip()[:2]
return language, description
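# Hypothetical examples of meta() on parsed attribute lists:
#   meta([('name', 'description'), ('content', 'my description')])
#     -> ('', 'my description')
#   meta([('http-equiv', 'Content-Language'), ('content', 'en-US')])
#     -> ('en', '')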
def can_append(url, rel):
    """Check the rel attribute to decide whether the crawler can take this link.

    Appends '!nofollow!' to the url if the links of that url must not be followed.

    :param url: url to add
    :type url: str
    :param rel: rel attribute of the tag
    :type rel: str
    :return: None if the url can't be added, otherwise the url
    """
if url:
if 'noindex' not in rel:
if 'nofollow' in rel:
url += '!nofollow!'
return url
else:
return None
else:
return None
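# Hypothetical examples of can_append():
#   can_append('http://example.com/a', '')         -> 'http://example.com/a'
#   can_append('http://example.com/a', 'nofollow') -> 'http://example.com/a!nofollow!'
#   can_append('http://example.com/a', 'noindex')  -> None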
class ExtractEncoding(HTMLParser):
    """Html parser to extract the encoding from the source code."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.encoding = str()
    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a start tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes
        :type attrs: list
        """
if tag == 'meta':
# <meta charset="utf-8">
charset = dict(attrs).get('charset')
if charset is not None:
self.encoding = charset
# <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
httpequiv = dict(attrs).get('http-equiv')
content = dict(attrs).get('content')
if httpequiv is not None and content is not None:
if httpequiv.lower() == 'content-type':
charset = content.find('charset')
if charset != -1:
self.encoding = content[charset+8:]
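# Minimal usage sketch, assuming only the two parsers above; the sample page and
# the expected values in the comments are hypothetical.
if __name__ == '__main__':
    SAMPLE_HTML = """<html lang="en"><head>
    <meta charset="utf-8">
    <meta name="description" content="A short demo page">
    <title>Demo page</title>
    </head><body>
    <h1>Demo heading</h1>
    <a href="http://example.com/next" rel="nofollow">next</a>
    </body></html>"""

    encoding_parser = ExtractEncoding()
    encoding_parser.feed(SAMPLE_HTML)
    print(encoding_parser.encoding)  # utf-8

    data_parser = ExtractData()
    data_parser.feed(SAMPLE_HTML)
    print(data_parser.title)        # Demo page
    print(data_parser.language)     # en
    print(data_parser.description)  # A short demo page
    print(data_parser.first_title)  # Demo heading
    print(data_parser.links)        # ['http://example.com/next!nofollow!']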