From 8d5f747e4f69c1093306a3124c42ca17392dc17c Mon Sep 17 00:00:00 2001 From: jbjjbjjbj Date: Wed, 21 Dec 2016 22:12:47 +0100 Subject: Wiki2P --- Scripts/wiki2P.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'Scripts/wiki2P.py') diff --git a/Scripts/wiki2P.py b/Scripts/wiki2P.py index f0ded01..84f4d82 100644 --- a/Scripts/wiki2P.py +++ b/Scripts/wiki2P.py @@ -1,16 +1,25 @@ -import requests, bs4 +import requests, bs4, re -#res = requests.get("https://en.wikipedia.org/wiki/Special:Random") -res = requests.get("https://en.wikipedia.org/wiki/Study") +# res = requests.get("https://en.wikipedia.org/wiki/Special:Random") +res = requests.get("https://en.wikipedia.org/wiki/Linux") -soup = bs4.BeautifulSoup(res.text) +soup = bs4.BeautifulSoup(res.text, "html.parser") -element = soup.select("p > a") +element = soup.select("#mw-content-text a[title]") + + +pattern = re.compile("^\/.*") + + +for i in element: + if "Edit section" not in i["title"] and pattern.match(i["href"]): + if "div" not in str(i.parent) and "th" not in str(i.parent) and "td" not in str(i.parent): + try: + i["class"] + except KeyError: + print(i) + break -print(element[0]) -#while( soup.select(".firstHeading")[0] != "Philosophy"): -# -# print(soup.select(".firstHeading")[0].text) -- cgit v1.2.3