Ce script fait partie des projets Python de Hyperskill.
J'ai fait toute la formation Python de Hyperskill pendant le confinement de 2020.
import os
import sys
from collections import deque

import requests
from bs4 import BeautifulSoup
from colorama import Fore
# Command-line arguments; the first positional argument names the folder
# in which downloaded pages are cached.
args = sys.argv
if len(args) < 2:
    # Exit with a usage hint instead of an IndexError traceback when the
    # folder argument is missing.
    sys.exit(f"Usage: {args[0]} <folder>")
folder = args[1]  # e.g. "Yolo"

# URLs of the pages fetched so far, most recent last (used by "back").
history = deque()

# Names of the HTML tags whose text is kept when a page is parsed.
list_tag = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "a", "ul", "ol", "li"]
def validate_url(_url):
    """Turn user input into a fetchable URL and a cache file name.

    Args:
        _url: Text typed by the user, with or without a scheme.

    Returns:
        A ``(url, name)`` tuple. ``url`` is the address to request; when the
        input has no ``https://`` or ``http://`` prefix but contains a dot,
        ``https://`` is prepended. ``name`` is the part of the address
        (scheme excluded) before its first dot. Input with neither a scheme
        nor a dot yields ``(False, None)``.
    """
    for scheme in ("https://", "http://"):
        if _url.startswith(scheme):
            # Slice the prefix off. str.strip("https://") treats its argument
            # as a set of characters and would also eat leading/trailing
            # h, t, p, s, ':' and '/' from the host name itself.
            return _url, _url[len(scheme):].split(".")[0]
    if "." in _url:
        return f"https://{_url}", _url.split(".")[0]
    return False, None
def _extract_text(markup):
    """Return the text of every tag named in ``list_tag``, one tag per line.

    Tags whose text is exactly a single newline are skipped; tabs, carriage
    returns and newlines are removed from the text that is kept.
    """
    soup = BeautifulSoup(markup, "html.parser")
    lines = []
    for tag in soup.find_all(list_tag):
        content = tag.get_text()
        if content != "\n":
            lines.append(
                content.replace("\t", "").replace("\r", "").replace("\n", "")
            )
    return "\n".join(lines)


# Emit colorama's blue foreground code; nothing below resets it.
print(Fore.BLUE)
while True:
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    user_input = input("Url ?")
    if user_input == "exit":
        # sys.exit is always available; the bare exit() helper is injected
        # by the site module and is meant for interactive sessions.
        sys.exit(0)
    if user_input == "back":
        if len(history) < 2:
            # No earlier page to return to.
            continue
        history.pop()  # drop the current page
        user_input = history[-1]  # previous page becomes the target
    url, filename = validate_url(user_input)
    print(url, filename)
    if url:
        try:
            req = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Keep the session alive on network or URL errors.
            print(f"Error: could not fetch {url}: {exc}")
            continue
        page_text = _extract_text(req.content)
        # Record the page only once it was fetched; the comparison keeps
        # "back" (whose target is already on top) from duplicating entries.
        if not history or history[-1] != url:
            history.append(url)
        # Open the cache file only after parsing succeeded, so a failure
        # never leaves a truncated file behind.
        with open(os.path.join(folder, filename), "w", encoding="utf-8") as file:
            file.write(page_text)
        print(page_text)
    else:
        cached_path = os.path.join(folder, user_input)
        # isfile() rather than exists(): empty input would otherwise resolve
        # to the cache folder itself and fail on open().
        if os.path.isfile(cached_path):
            with open(cached_path, "r", encoding="utf-8") as file:
                print(file.read())
        else:
            print("Error: Incorrect URL")
cyvax - 2024