Source code for crawling.searches

#!/usr/bin/python3

"""Define several functions SiteInformations."""

from re import compile as compile_regex
from urllib.parse import urlparse

from swiftea_bot.data import BAD_EXTENTIONS, DIR_STATS


regex = compile_regex(r'(\w+|\d+)')

def clean_text(text):
    """Clean up text by removing tabulations, blanks and carriage returns.

    :param text: text to clean
    :type text: str
    :return: cleaned text
    """
    return ' '.join(text.split())
def get_base_url(url):
    """Get the base url using urlparse.

    :param url: url
    :type url: str
    :return: base url of the given url
    """
    infos_url = urlparse(url)
    base_url = infos_url.scheme + '://' + infos_url.netloc
    return base_url
def is_homepage(url):
    """Check if the url is a homepage.

    A url is considered a homepage if it contains only two '/',
    and two '.' when it contains 'www.' or one '.' otherwise.

    :param url: url to check
    :type url: str
    :return: True or False
    """
    if url.count('/') == 2:
        if '//www.' in url and url.count('.') == 2:
            return True
        elif url.count('.') == 1:
            return True
        else:
            return False
    else:
        return False
def capitalize(text):
    """Uppercase the first letter of the given text.

    :param text: text
    :type text: str
    :return: text with its first letter uppercased
    """
    if len(text) > 0:
        return text[0].upper() + text[1:]
    else:
        return ''
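
A minimal usage sketch, assuming the module is importable as crawling.searches (as the page title suggests); the example.com URLs are illustrative, and the expected results in the comments follow from the function bodies above:

from crawling.searches import clean_text, get_base_url, is_homepage, capitalize

print(clean_text('some\ttext  with\nbreaks'))            # 'some text with breaks'
print(get_base_url('https://www.example.com/page?q=1'))  # 'https://www.example.com'
print(is_homepage('https://www.example.com'))            # True: two '/', two '.'
print(is_homepage('https://www.example.com/page'))       # False: three '/'
print(capitalize('hello world'))                         # 'Hello world'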