#!/usr/bin/python3
import requests as req
from reppy.cache import RobotsCache
from swiftea_bot.data import HEADERS
from crawling.connection import *
from crawling.searches import *
from crawling.web_connection import WebConnection
from crawling.site_informations import SiteInformations
from crawling.parsers import *
import tests.test_data as test_data
class CrawlingBaseTest(object):
    """Base class for all crawler test classes.

    Provides the shared fixtures (sample URL, HTML snippets, parsers,
    word lists, HTTP headers and a robots.txt cache) that the concrete
    crawler test classes rely on.
    """

    def setup_method(self, _):
        """Configure the app: build fresh fixtures before each test method."""
        self.url = "http://aetfiws.ovh"
        # Sample HTML documents loaded from the shared test-data module.
        self.code1 = test_data.CODE1
        self.code2 = test_data.CODE2
        self.code3 = test_data.CODE3
        # Fresh parser instances so state never leaks between tests.
        self.parser = ExtractData()
        self.parser_encoding = ExtractEncoding()
        self.STOPWORDS = {'fr': ('mot', 'pour', 'de')}
        self.BADWORDS = {'fr': ('pipe', 'xxx')}
        self.is_title = True
        self.title = 'letter'
        # Minimal HTTP response headers used by encoding-detection tests.
        self.headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'vary': 'X-PJAX, Accept-Encoding'}
        self.reqrobots = RobotsCache(capacity=100)
class TestConnection(CrawlingBaseTest):
    """Tests for the helpers in crawling.connection."""

    def test_check_connection(self):
        # self.url is expected to be unreachable; no argument checks the default host.
        assert not check_connection(self.url)
        assert check_connection()

    def test_is_nofollow(self):
        # The '!nofollow!' suffix must be detected and stripped from the URL.
        nofollow, url = is_nofollow(self.url + '!nofollow!')
        assert nofollow
        assert url == self.url
        nofollow, url = is_nofollow(self.url)
        assert not nofollow
        assert url == self.url

    def test_duplicate_content(self):
        # Similar-length strings are flagged as duplicates, dissimilar ones are not.
        assert duplicate_content('un premier code', 'un deuxieme code')
        assert not duplicate_content('un premier code un peu plus grand', 'un deuxieme code')

    def test_all_urls(self):
        # NOTE(review): these assertions hit the live network and depend on
        # the remote pages' current markup — fragile by design of the suite.
        request = req.get("https://fr.wikipedia.org")
        assert all_urls(request) == ["https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Accueil_principal", "https://fr.wikipedia.org"]
        request = req.get("https://choosealicense.com/")
        assert all_urls(request) == ["https://choosealicense.com"]
class TestWebConnection(CrawlingBaseTest):
    """Tests for crawling.web_connection.WebConnection.

    Methods are called unbound with ``self`` as a stand-in instance, so
    only the fixtures from CrawlingBaseTest are available to them.
    """

    def test_search_encoding(self):
        # Returns (encoding, confidence): 0 when it fell back to the default,
        # 1 when the encoding was found in the headers or the document itself.
        assert WebConnection.search_encoding(self, {}, self.code3) == ('utf-8', 0)
        assert WebConnection.search_encoding(self, self.headers, self.code3) == ('utf-8', 1)
        assert WebConnection.search_encoding(self, {}, self.code1) == ('utf-8', 1)
        assert WebConnection.search_encoding(self, {}, self.code2) == ('UTF-16 LE', 1)

    def test_check_robots_perm(self):
        # NOTE(review): live-network assertions; results depend on each
        # site's current robots.txt.
        assert WebConnection.check_robots_perm(self, 'https://zestedesavoir.com')
        assert not WebConnection.check_robots_perm(self, 'https://www.facebook.com')
        assert not WebConnection.check_robots_perm(self, self.url)
        assert WebConnection.check_robots_perm(self, 'http://premium.lefigaro.fr')

    def test_send_request(self):
        WebConnection.send_request(self, 'https://zestedesavoir.com')
        # An unresolvable host must yield None rather than raising.
        assert WebConnection.send_request(self, 'https://uneurlbidon.com') is None

    def test_duplicate_content(self):
        request = req.get('https://zestedesavoir.com')
        WebConnection.duplicate_content(self, request, 'https://zestedesavoir.com')
class TestSearches(CrawlingBaseTest):
    """Tests for the text/URL helpers in crawling.searches."""

    def test_clean_text(self):
        # All whitespace control characters must be stripped or normalized.
        text = clean_text('Sample text with non-desired \r whitespaces \t chars \n')
        assert '\n' not in text and '\r' not in text and '\t' not in text

    def test_get_base_url(self):
        assert get_base_url(self.url + '/page1.php') == self.url

    def test_is_homepage(self):
        assert is_homepage('http://www.bfmtv.com')
        assert not is_homepage('http://www.bfmtv.com/page.html')
        assert is_homepage('https://github.com')
        # Subdomains do not count as homepages.
        assert not is_homepage('http://bfmbusiness.bfmtv.com')

    def test_capitalize(self):
        # Only the first character is upper-cased; the rest is untouched.
        assert capitalize('ceci est un Titre') == 'Ceci est un Titre'
        assert capitalize('') == ''

    def test_clean_link(self):
        # Fragments are removed, query strings are kept.
        assert clean_link('http://www.example.fr?w=word#big_title') == 'http://www.example.fr?w=word'

    def test_stats_links(self):
        # Smoke test: must run without raising.
        stats_links(50)
class TestParsers(CrawlingBaseTest):
    """Tests for crawling.parsers (ExtractData / ExtractEncoding)."""

    def test_can_append(self):
        # 'noindex' forbids appending; 'nofollow' alone tags the URL.
        assert can_append('about/ninf.php', 'noindex, nofollow') is None
        assert can_append('about/ninf.php', 'nofollow') == 'about/ninf.php!nofollow!'
        assert can_append('about/ninf.php', '') == 'about/ninf.php'
        assert can_append(None, '') is None

    def test_handle_entityref(self):
        # Entity references are decoded and appended to the current title.
        ExtractData.handle_entityref(self, 'eacute')
        assert self.title == 'letteré'
        ExtractData.handle_entityref(self, 'agrave')
        assert self.title == 'letteréà'

    def test_handle_charref(self):
        pass

    def test_parser(self):
        # Feed the first sample document and check every extracted field.
        self.parser.feed(self.code1)
        assert self.parser.links == ['demo', 'index', 'about/nf.php!nofollow!']
        assert clean_text(self.parser.first_title) == 'Gros titre'
        keywords = 'une CSS Demo ici! Gros titre Moyen titre petit titre strong em Why use Swiftea ?1 Why use Swiftea ?2 Why use Swiftea ?3 © >'
        assert clean_text(self.parser.keywords) == keywords
        assert self.parser.css
        assert self.parser.description == 'Moteur de recherche'
        assert self.parser.language == 'en'
        assert self.parser.favicon == 'public/favicon.ico'
        assert self.parser.title == 'Swiftea'
        # Subsequent feeds update language/favicon on the same parser instance.
        self.parser.feed(self.code2)
        assert self.parser.language == 'en'
        assert self.parser.favicon == 'public/favicon2.ico'
        self.parser.feed(self.code3)
        assert self.parser.language == 'fr'

    def test_parser_encoding(self):
        # Encoding is extracted from the document's meta/declaration.
        self.parser_encoding.feed(self.code1)
        assert self.parser_encoding.encoding == 'utf-8'
        self.parser_encoding.feed(self.code2)
        assert self.parser_encoding.encoding == 'UTF-16 LE'