Source code for crawling.site_informations
#!/usr/bin/python3
"""After parse source code, data extracted must be classify and clean.
Here is a class who use the html parser and manage all results."""
from urllib.parse import urlparse
from swiftea_bot.module import tell, remove_duplicates
from crawling import parsers, searches
class SiteInformations(object):
"""Class to manage searches in source codes."""
def __init__(self):
"""Build searches manager."""
self.parser = parsers.ExtractData()
    def set_listswords(self, stopwords, badwords):
        """Set the stopword and badword lists, keyed by language code."""
self.STOPWORDS = stopwords
self.BADWORDS = badwords
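
    # Both attributes are expected to be dicts keyed by language code,
    # mapping to word lists; a hypothetical sketch of the shape, not the
    # real data loaded by the project:
    #   stopwords = {'en': ['the', 'and', 'of'], 'fr': ['le', 'et', 'de']}
    #   badwords = {'en': ['spamword'], 'fr': ['motspam']}
    # detect_language, clean_keywords and sane_search index them by language.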
    def get_infos(self, url, code, nofollow, score):
        """Manage all the searches for a webpage's information.

        :param url: url of the webpage
        :type url: str
        :param code: source code of the webpage
        :type code: str
        :param nofollow: if True, the webpage's links are not collected
        :type nofollow: bool
        :param score: score of the webpage
        :type score: int
        :return: dict of results (title, description, keywords, language,
            score, favicon...) and the list of links
        """
results = dict()
results['homepage'] = 1 if searches.is_homepage(url) else 0
self.parser.feed(code)
results['title'] = searches.clean_text(searches.capitalize(self.parser.title)) # Find title and clean it
keywords = searches.clean_text(self.parser.keywords.lower()).split()
# Language:
if self.parser.language != '':
language = self.parser.language
score += 1
else:
language = self.detect_language(keywords)
if language in self.STOPWORDS and self.parser.title != '':
keywords = self.clean_keywords(keywords, language)
keywords.extend(self.clean_keywords(results['title'].lower().split(), language))
infos_url = urlparse(url)
        path_position = infos_url.path.rfind('.')
        # Keep the whole path when there is no extension (rfind returns -1)
        path = infos_url.path[:path_position] if path_position != -1 else infos_url.path
        # clean_keywords expects a list of strings, not a bare string
        keywords.extend(self.clean_keywords([path], language))
results['sanesearch'] = self.sane_search(keywords, language)
results['language'] = language
results['keywords'] = keywords
# Description:
if self.parser.description == '':
results['description'] = searches.clean_text(searches.capitalize(self.parser.first_title))
else:
results['description'] = searches.clean_text(searches.capitalize(self.parser.description))
# Css:
if self.parser.css:
score += 1
base_url = searches.get_base_url(url)
# Links:
if nofollow:
links = list()
else:
links = self.clean_links(self.parser.links, base_url)
searches.stats_links(len(links))
if self.parser.favicon != '':
results['favicon'] = self.clean_favicon(self.parser.favicon, base_url)
else:
results['favicon'] = ''
else:
tell('No language or title', severity=-1)
results = {'title': ''}
links = list()
results['score'] = score
return results, links
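
    # Hypothetical usage sketch (assumes the word lists were set with
    # set_listswords and `code` holds the page's HTML source):
    #   site = SiteInformations()
    #   site.set_listswords(stopwords, badwords)
    #   results, links = site.get_infos('http://example.com/page.html',
    #                                   code, nofollow=False, score=0)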
    def detect_language(self, keywords):
        """Detect the language of a webpage when it isn't given.

        :param keywords: keywords of the webpage, used for detection
        :type keywords: list
        :return: the language found, or an empty string
        """
total_stopwords = 0
        # Count the matching stopwords for each language
nb_stopwords = dict()
for lang in self.STOPWORDS:
nb_stopwords[lang] = 0
for keyword in keywords:
if keyword in self.STOPWORDS[lang]:
total_stopwords += 1
nb_stopwords[lang] += 1
if total_stopwords != 0:
language = max(nb_stopwords, key=nb_stopwords.get)
else:
language = ''
return language
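
    # Worked example of the counting above, with hypothetical lists
    # STOPWORDS = {'en': ['the', 'and'], 'fr': ['le', 'et']} and
    # keywords = ['the', 'cat', 'and', 'le']: nb_stopwords becomes
    # {'en': 2, 'fr': 1}, so max() picks 'en'. Without any match,
    # total_stopwords stays 0 and an empty string is returned.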
    def clean_links(self, links, base_url=None):
        """Clean the webpage's links: rebuild urls with the base url and
        remove anchors, mailto, javascript and .index links.

        :param links: links to clean
        :type links: list
        :param base_url: base url used to rebuild relative links
        :type base_url: str
        :return: cleaned links, without duplicates
        """
links = remove_duplicates(links)
new_links = list()
for url in links:
new_url = searches.clean_link(url, base_url)
if new_url:
new_links.append(new_url)
return remove_duplicates(new_links)
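
    # Sketch, assuming searches.clean_link resolves relative urls against
    # base_url and returns None for anchors, mailto: and javascript: links:
    #   clean_links(['/about', 'mailto:a@b.c', '/about'], 'http://example.com')
    #   # -> ['http://example.com/about']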
    def clean_favicon(self, favicon, base_url):
        """Clean a favicon url.

        :param favicon: favicon url to clean
        :type favicon: str
        :param base_url: base url of the webpage
        :type base_url: str
        :return: cleaned favicon url
        """
if not favicon.startswith('http') and not favicon.startswith('www'):
if favicon.startswith('//'):
favicon = 'http:' + favicon
elif favicon.startswith('/'):
favicon = base_url + favicon
else:
favicon = base_url + '/' + favicon
return favicon
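
    # The three rebuild cases, with base_url = 'http://example.com':
    #   '//cdn.example.com/favicon.ico' -> 'http://cdn.example.com/favicon.ico'
    #   '/favicon.ico'                  -> 'http://example.com/favicon.ico'
    #   'favicon.ico'                   -> 'http://example.com/favicon.ico'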
    def clean_keywords(self, dirty_keywords, language):
        """Clean the found keywords.

        Remove stopwords, bad characters and words of fewer than two
        letters, and split compounds such as word1-word2.

        :param dirty_keywords: keywords to clean
        :type dirty_keywords: list
        :param language: language of the webpage
        :type language: str
        :return: list of cleaned keywords
        """
stopwords = self.STOPWORDS[language]
cleaned_keywords = list()
        half_cleaned_keywords = list()  # Cleaned with the regex, may still contain '_'
for keyword in dirty_keywords:
half_cleaned_keywords.extend(searches.regex.findall(keyword))
new_keywords = list() # Without '_'
for keyword in half_cleaned_keywords:
new_keywords.extend(keyword.split('_'))
for keyword in new_keywords:
if keyword not in stopwords and len(keyword) > 1:
cleaned_keywords.append(keyword)
return cleaned_keywords
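
    # Sketch of the pipeline, assuming searches.regex matches word
    # characters (roughly r'\w+') and 'the' is an English stopword:
    # ['word1-word2', 'a_bc', 'the'] -> regex -> ['word1', 'word2', 'a_bc', 'the']
    # -> '_' split -> ['word1', 'word2', 'a', 'bc', 'the']
    # -> filter stopwords / one-letter words -> ['word1', 'word2', 'bc']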
    def sane_search(self, keywords, language, max_ratio=.2):
        """Filter adult websites.

        :param keywords: webpage's keywords
        :type keywords: list
        :param language: language of the website
        :type language: str
        :param max_ratio: ratio of bad words above which the site is flagged
        :type max_ratio: float
        :return: True if the site is flagged as an adult website
        """
badwords = self.BADWORDS[language]
nb_badwords = 0
nb_words = len(keywords)
if nb_words == 0:
return False
for keyword in keywords:
if keyword in badwords:
nb_badwords += 1
ratio = nb_badwords / nb_words
if ratio >= max_ratio:
tell('bad site detected')
return True
else:
return False
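
# A minimal, hypothetical demo of the methods above; the word lists are
# stand-ins for the real ones loaded elsewhere in the project, and
# searches.regex is assumed to match word characters.
if __name__ == '__main__':
    stopwords = {'en': ['the', 'and', 'of', 'a'], 'fr': ['le', 'et', 'de']}
    badwords = {'en': ['spamword'], 'fr': []}
    site = SiteInformations()
    site.set_listswords(stopwords, badwords)
    words = ['the', 'crawler', 'and', 'the', 'index']
    language = site.detect_language(words)  # 'the' and 'and' are English
    print(language)                              # -> 'en'
    print(site.clean_keywords(words, language))  # stopwords filtered out
    print(site.sane_search(words, language))     # -> False: no bad words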