# Source code for database.database_swiftea


from datetime import datetime

from swiftea_bot.module import tell
from import CRAWL_DELAY
from database.database_manager import DatabaseManager
import database.database as database

class DatabaseSwiftea(DatabaseManager):
    """Class to manage Swiftea database.

    All queries run against the table given at construction time (``self.t``),
    except :meth:`suggestions`, which works on the ``suggestion`` table.

    ``send_command`` (inherited) is used throughout as returning a
    ``(result, response)`` pair where ``response`` is a status string that
    contains ``'error'`` on failure.

    :param host: hostname forwarded to :class:`DatabaseManager`
    :type host: str
    :param user: username to use for connection
    :type user: str
    :param password: password to use for connection
    :type password: str
    :param name: name of database
    :type name: str
    :param table: name of the table holding the documents
    :type table: str

    """

    def __init__(self, host, user, password, name, table):
        DatabaseManager.__init__(self, host, user, password, name)
        # NOTE: the table name is interpolated into SQL text with
        # str.format (identifiers cannot be bound as parameters), so it
        # must come from trusted configuration.
        self.t = table

    def send_doc(self, webpage_infos):
        """Send document information to the database.

        Looks up the url: an unknown url is inserted, a known url is updated
        (popularity incremented) only if its last crawl is older than
        ``CRAWL_DELAY``; otherwise nothing is written.

        :param webpage_infos: informations to send to database
        :type webpage_infos: dict
        :return: True if an error occurred

        """
        self.connection()
        try:
            result, response = self.send_command(
                "SELECT popularity, last_crawl FROM {} WHERE url = %s".format(self.t),
                (webpage_infos['url'],), True)
            if 'error' in response:
                tell('Popularity and last_crawl query failed: ' + response, 16)
                # Without a reliable lookup we cannot tell whether to insert
                # or update, so stop here instead of writing blindly.
                return True

            if not result:
                # Url not found in database: add it.
                return self.insert(webpage_infos)

            # Url found in database: first row is (popularity, last_crawl).
            popularity = result[0][0]
            last_crawl = result[0][1]  # datetime.datetime object
            # Assumes CRAWL_DELAY is a datetime.timedelta -- TODO confirm.
            if (datetime.now() - last_crawl) > CRAWL_DELAY:
                # Last crawl is old enough: refresh the stored document and
                # propagate the update status to the caller.
                return self.update(webpage_infos, popularity + 1)

            tell('Recently crawled: ' + webpage_infos['url'])
            return False
        finally:
            # Always release the connection, even if a query raised.
            self.close_connection()

    def update(self, infos, popularity):
        """Update a document in database.

        :param infos: doc infos
        :type infos: dict
        :param popularity: new doc popularity
        :type popularity: int
        :return: True if an error occurred

        """
        tell('Updating ' + infos['url'])
        cmd = """
UPDATE {}
SET title=%s, description=%s, last_crawl=NOW(), language=%s,
popularity=%s, score=%s, homepage=%s, sanesearch=%s, favicon=%s
WHERE url = %s""".format(self.t)
        _, response = self.send_command(
            cmd,
            (infos['title'], infos['description'], infos['language'],
             popularity, infos['score'], infos['homepage'],
             infos['sanesearch'], infos['favicon'], infos['url'])
        )
        if 'error' in response:
            tell('Failed to update: ' + response, 9)
            return True
        return False

    def insert(self, infos):
        """Insert a new document in database.

        :param infos: doc infos
        :type infos: dict
        :return: True if an error occurred

        """
        tell('Adding ' + infos['url'])
        cmd = """INSERT INTO {} (title, description, url, first_crawl, last_crawl, language,
popularity, score, homepage, sanesearch, favicon)
VALUES (%s, %s, %s, NOW(), NOW(), %s, 1, %s, %s, %s, %s)""".format(self.t)
        _, response = self.send_command(
            cmd,
            (infos['title'], infos['description'], infos['url'],
             infos['language'], infos['score'], infos['homepage'],
             infos['sanesearch'], infos['favicon'])
        )
        # The status message is the second item of the returned pair; test
        # the whole message, not one of its characters.
        if 'error' in response:
            tell('Failed to add: ' + response, 10)
            return True
        return False

    def get_doc_id(self, url):
        """Get id of a document in database.

        :param url: url of webpage
        :type url: str
        :return: id of webpage or None if not found

        """
        result, response = self.send_command(
            "SELECT id FROM {} WHERE url = %s".format(self.t), (url,))
        if 'error' in response:
            tell('Failed to get id: ' + response, 11)
            return None
        if not result:
            # No row for this url.
            return None
        return str(result[0])

    def del_one_doc(self, url, table=None):
        """Delete document corresponding to url.

        :param url: url of webpage
        :type url: str
        :param table: table to delete from, defaults to the instance table
        :type table: str
        :return: status message

        """
        if table is None:
            table = self.t
        # Report the table actually targeted by the DELETE.
        tell('Delete from {} doc: {}'.format(table, url))
        _, response = self.send_command(
            "DELETE FROM {} WHERE url = %s".format(table), (url,))
        if 'error' in response:
            tell('Doc not removed: {0}, {1}'.format(url, response), 12)
        return response

    def suggestions(self):
        """Get the five first URLs from Suggestion table and delete them.

        :return: list of url taken from Suggestion table, or None if the
            query failed

        """
        result, response = self.send_command(
            "SELECT url FROM suggestion LIMIT 5", fetchall=True)
        if 'error' in response:
            tell('Failed to get url: ' + response, 13)
            return None

        suggested_links = []
        for row in result or ():
            suggested_links.append(row[0])
            # Delete from the same table the url was read from.
            self.del_one_doc(row[0], 'suggestion')
        return suggested_links

    def doc_exists(self, url):
        """Check if `url` is in database.

        :param url: url corresponding to doc
        :type url: str
        :return: True if doc exists, False if not, None if the query failed

        """
        result, response = self.send_command(
            "SELECT EXISTS(SELECT * FROM {} WHERE url=%s)".format(self.t),
            (url,))
        if 'error' in response:
            tell('Failed to check row: ' + response, 14)
            return None
        return bool(result) and result[0] == 1

    def https_duplicate(self, old_url):
        """Avoid https and http duplicate.

        If old url is secure (https), must delete insecure url if exists,
        then return secure url (old url).
        If old url is insecure (http), must delete it if secure url exists,
        then return secure url (new url)

        :param old_url: old url
        :type old_url: str
        :return: url to add and url to delete (None when there is nothing
            to delete)

        """
        tell('url to send: ' + old_url, severity=-1)
        # new_url is old_url with the other scheme, going by the original
        # comments on database.convert_secure -- verify against that helper.
        new_url = database.convert_secure(old_url)
        new_exists = self.doc_exists(new_url)

        if database.url_is_secure(old_url):
            # old_url starts with https: keep it, drop the http twin if any.
            return old_url, (new_url if new_exists else None)

        # old_url is insecure (http).
        if not new_exists:
            # No secure version stored: keep the insecure url.
            return old_url, None
        # Secure url exists: prefer it, and drop the insecure one if stored.
        return new_url, (old_url if self.doc_exists(old_url) else None)