diff options
author | jbjjbjjbj <julianteule@gmail.com> | 2016-12-21 22:12:47 +0100 |
---|---|---|
committer | jbjjbjjbj <julianteule@gmail.com> | 2016-12-21 22:12:47 +0100 |
commit | 8d5f747e4f69c1093306a3124c42ca17392dc17c (patch) | |
tree | bdb4b2c1f0fd4e93ef37f0632f0cd8c697e40059 | |
parent | 582b827c0ec672b409dd214d43fa83018352c729 (diff) |
Wiki2P
-rw-r--r-- | Scripts/wiki2P.py | 27 |
1 file changed, 18 insertions, 9 deletions
diff --git a/Scripts/wiki2P.py b/Scripts/wiki2P.py
index f0ded01..84f4d82 100644
--- a/Scripts/wiki2P.py
+++ b/Scripts/wiki2P.py
@@ -1,16 +1,25 @@
-import requests, bs4
+import requests, bs4, re
 
-#res = requests.get("https://en.wikipedia.org/wiki/Special:Random")
-res = requests.get("https://en.wikipedia.org/wiki/Study")
+# res = requests.get("https://en.wikipedia.org/wiki/Special:Random")
+res = requests.get("https://en.wikipedia.org/wiki/Linux")
 
-soup = bs4.BeautifulSoup(res.text)
+soup = bs4.BeautifulSoup(res.text, "html.parser")
 
-element = soup.select("p > a")
+element = soup.select("#mw-content-text a[title]")
+
+
+pattern = re.compile("^\/.*")
+
+
+for i in element:
+    if "Edit section" not in i["title"] and pattern.match(i["href"]):
+        if "div" not in str(i.parent) and "th" not in str(i.parent) and "td" not in str(i.parent):
+            try:
+                i["class"]
+            except KeyError:
+                print(i)
+                break
 
-print(element[0])
-#while( soup.select(".firstHeading")[0] != "Philosophy"):
-#
-# print(soup.select(".firstHeading")[0].text)