diff options
Diffstat (limited to 'Scripts/wiki2P.py')
-rw-r--r-- | Scripts/wiki2P.py | 47 |
1 files changed, 34 insertions, 13 deletions
# Scripts/wiki2P.py -- post-change script reconstructed from the collapsed
# diff 84f4d82..316c7e9, with the defects noted in the comments below fixed.
"""Follow the first article link of Wikipedia pages until "Philosophy"."""

import re

WIKI_BASE = "https://en.wikipedia.org"
START_SITE = "/wiki/Linux"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Site-relative article links, excluding the File and Help namespaces.
# Compiled once; raw string avoids the invalid "\/" escape of the original.
ARTICLE_LINK = re.compile(r"/wiki/(?!File|Help).*")


def first_article_link(hrefs):
    """Return the first href in *hrefs* that matches ARTICLE_LINK.

    Args:
        hrefs: iterable of href values; entries may be None or empty for
            anchors that carry no ``href`` attribute, and are skipped.

    Returns:
        The first matching href, or "" when none matches.
    """
    for href in hrefs:
        if href and ARTICLE_LINK.match(href):
            return href
    return ""


def is_philosophy(site):
    """Return True when *site* names the "Philosophy" page.

    Only the last path segment is compared (case-insensitively, with any
    ``#fragment`` dropped), so both "/wiki/Philosophy" and "philosophy"
    are accepted. The original used ``is not "philosophy"``, an identity
    test against a literal that could never end the loop.
    """
    page = site.partition("#")[0].rsplit("/", 1)[-1]
    return page.lower() == "philosophy"


def calculate(site):
    """Download a Wikipedia page and return its first article link.

    The link is taken from the anchors of the first ``<p>`` found inside
    the first ``<div class="mw-content-ltr">`` of the page.

    Args:
        site: site-relative path appended to WIKI_BASE, e.g. "/wiki/Linux".

    Returns:
        The href of the first matching link, or "" when the expected
        div/paragraph is missing or contains no matching link.

    Raises:
        requests.RequestException: if the download fails or times out.
    """
    # Imported here so the pure helpers above stay importable (and
    # testable) on machines without the third-party packages installed.
    import bs4
    import requests

    print("Downloading wikipedia site: " + site)
    res = requests.get(WIKI_BASE + site, timeout=REQUEST_TIMEOUT)
    print("Download completed analysing")

    soup = bs4.BeautifulSoup(res.text, "html.parser")
    content = soup.find("div", {"class": "mw-content-ltr"})
    paragraph = content.find("p") if content is not None else None
    if paragraph is None:
        # The original dereferenced None here and crashed with AttributeError.
        return ""

    # .get() instead of ["href"]: anchors without href no longer raise KeyError.
    return first_article_link(a.get("href") for a in paragraph.find_all("a"))


def main(start=START_SITE):
    """Walk from *start*, one page per Enter key press, until Philosophy."""
    site = start
    while not is_philosophy(site):
        input(site)  # show the current page and wait for the user
        site = calculate(site)
        if not site:
            # An empty result used to be fed back in, fetching the bare base URL.
            print("No article link found in the first paragraph; stopping")
            return
    print("Reached: " + site)


if __name__ == "__main__":
    main()