Source code for crawling.parsers

#!/usr/bin/python3

"""Data of webpage is provided by the python html.parser.
There are two parsers: the first one for all informations and
the second one only for encoding."""

from html.parser import HTMLParser
from html.entities import name2codepoint, html5

from swiftea_bot.data import LIST_TAG_WORDS, LIST_ALONE_TAG_WORDS


class ExtractData(HTMLParser):
    """Html parser to extract data.

    dict(attrs).get('content'): convert attrs into a dict and return the value for 'content'.

    Data that can be extracted:
    title
    language
    description
    links with nofollow and noindex
    stylesheet
    favicon
    keywords: h1, h2, h3, strong, em

    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = list()  # List of links
        self.keywords = ''  # All keywords in a string
        self.is_title = False  # True if data is the title
        self.word1 = False  # True if data are words
        self.word2 = False  # True if data are words and the tag is used both inside and outside other word tags
        self.css = False  # True if there is a css link in the source code
        self.h1 = False  # True if parsing the first title (h1) of the webpage
        self.first_title = ''  # The first title (h1) of the webpage
        self.description = self.language = self.title = self.favicon = ''
    def re_init(self):
        """Called when the parser meets the html tag: reset all variables to their default value."""
        self.links = list()
        self.first_title = self.keywords = self.description = ''
        self.language = self.title = self.favicon = ''
        self.css = self.h1 = False
        self.is_title = False
        self.word1 = False
        self.word2 = False
    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a starting tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes, e.g. [('name', 'language'), ('content', 'fr')]
        :type attrs: list

        """
        if tag == 'html':
            self.re_init()
            if len(dict(attrs).get('lang', '')) >= 2:
                self.language = dict(attrs).get('lang').lower().strip()[:2]
        elif tag == 'a':
            url = can_append(dict(attrs).get('href'), dict(attrs).get('rel', ''))
            if url:
                self.links.append(url)
        elif tag == 'link':
            rel = dict(attrs).get('rel', '')
            if rel == 'stylesheet':  # <link rel="stylesheet" type="text/css">
                self.css = True
            elif rel == 'icon' or rel == 'shortcut icon':  # <link rel="icon" href="favicon.ico">
                self.favicon = dict(attrs).get('href', '')
        elif tag == 'meta':
            language, description = meta(attrs)
            if language != str():
                self.language = language
            if description != str():
                self.description = description
        elif tag == 'title':
            self.is_title = True  # It's about the title
        if tag in LIST_TAG_WORDS:
            self.word1 = True
        if tag in LIST_ALONE_TAG_WORDS:  # Tag used both inside and outside tags from LIST_TAG_WORDS
            self.word2 = True
        if tag == 'h1' and self.first_title == '':
            self.h1 = True  # It's about an h1
    def handle_data(self, data):
        """Called when the parser meets data.

        :param data: data inside a tag
        :type data: str

        """
        if self.is_title:
            self.title += data
        if self.word1 or self.word2:
            self.keywords += ' ' + data
        if self.h1:
            self.first_title = data
    def handle_endtag(self, tag):
        """Called when the parser meets an ending tag.

        :param tag: ending tag
        :type tag: str

        """
        if tag == 'title':
            self.is_title = False
        if tag == 'h1':
            self.h1 = False
        if tag in LIST_TAG_WORDS:
            self.word1 = False
        if tag in LIST_ALONE_TAG_WORDS:  # Tag used both inside and outside tags from LIST_TAG_WORDS
            self.word2 = False
    def handle_entityref(self, name):
        """Called when the parser meets a named character reference (e.g. &amp;)."""
        try:
            letter = chr(name2codepoint[name])
        except KeyError:
            try:
                letter = html5[name + ';']
            except KeyError:
                return  # Unknown entity: ignore it
        if self.is_title:
            self.title += letter
    def handle_charref(self, name):
        """Called when the parser meets a numeric character reference (e.g. &#38; or &#x26;)."""
        if name.startswith('x'):
            letter = chr(int(name[1:], 16))
        else:
            letter = chr(int(name))
        if self.is_title:
            self.title += letter
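
A minimal usage sketch, for illustration only (it is not part of the module and
assumes that swiftea_bot.data provides the LIST_TAG_WORDS and LIST_ALONE_TAG_WORDS
lists and that html.parser runs with its default settings):

>>> parser = ExtractData()
>>> parser.feed('<html lang="en"><head><title>Tom &amp; Jerry</title></head>'
...             '<body><a href="http://example.com/a" rel="nofollow">A</a></body></html>')
>>> parser.title
'Tom & Jerry'
>>> parser.language
'en'
>>> parser.links
['http://example.com/a!nofollow!']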
def meta(attrs):
    """Manage searches in meta tags.

    We can find:
    <meta name='description' content='my description'/>
    <meta name='language' content='en'/>
    <meta http-equiv='content-language' content='en'/>

    :param attrs: attributes of the meta tag
    :type attrs: list
    :return: language, description

    """
    description = str()
    language = str()
    name = dict(attrs).get('name', '').lower()
    content = dict(attrs).get('content')
    if content:
        if name == 'description':
            description = content
        elif name == 'language':
            language = content.lower().strip()[:2]
    httpequiv = dict(attrs).get('http-equiv')
    contentlanguage = dict(attrs).get('content')
    if httpequiv and contentlanguage:
        if httpequiv.lower() == 'content-language':
            language = contentlanguage.lower().strip()[:2]
    return language, description
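
For illustration (the attribute lists below are shaped the way html.parser
passes them to handle_starttag):

>>> meta([('name', 'description'), ('content', 'My description')])
('', 'My description')
>>> meta([('http-equiv', 'Content-Language'), ('content', 'en-US')])
('en', '')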
def can_append(url, rel):
    """Check the rel attribute to know if the crawler can crawl the link.

    Append '!nofollow!' to the url if the crawler must not follow the links of that page.

    :param url: url to add
    :type url: str
    :param rel: rel attribute of the a tag
    :type rel: str
    :return: None if the url must not be added, otherwise the url

    """
    if url:
        if 'noindex' not in rel:
            if 'nofollow' in rel:
                url += '!nofollow!'
            return url
        else:
            return None
    else:
        return None
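
For illustration, a 'nofollow' link is kept but tagged, while a 'noindex' link
is dropped:

>>> can_append('http://example.com/page', 'nofollow')
'http://example.com/page!nofollow!'
>>> can_append('http://example.com/page', 'noindex') is None
True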
class ExtractEncoding(HTMLParser):
    """Html parser to extract the encoding from the source code."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.encoding = str()
    def handle_starttag(self, tag, attrs):
        """Called when the parser meets a starting tag.

        :param tag: starting tag
        :type tag: str
        :param attrs: attributes
        :type attrs: list

        """
        if tag == 'meta':
            # <meta charset="utf-8">
            charset = dict(attrs).get('charset')
            if charset is not None:
                self.encoding = charset
            # <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
            httpequiv = dict(attrs).get('http-equiv')
            content = dict(attrs).get('content')
            if httpequiv is not None and content is not None:
                if httpequiv.lower() == 'content-type':
                    charset = content.find('charset')
                    if charset != -1:
                        self.encoding = content[charset + 8:]  # len('charset=') == 8
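
A minimal usage sketch, for illustration only (it relies on html.parser's
default handling of self-closing tags); both meta forms shown in the comments
above are recognised:

>>> encoding_parser = ExtractEncoding()
>>> encoding_parser.feed('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
>>> encoding_parser.encoding
'UTF-8'
>>> charset_parser = ExtractEncoding()
>>> charset_parser.feed('<meta charset="utf-8">')
>>> charset_parser.encoding
'utf-8'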