summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author jbjjbjjbj <julianteule@gmail.com> 2016-12-21 22:12:47 +0100
committer jbjjbjjbj <julianteule@gmail.com> 2016-12-21 22:12:47 +0100
commit 8d5f747e4f69c1093306a3124c42ca17392dc17c (patch)
tree bdb4b2c1f0fd4e93ef37f0632f0cd8c697e40059
parent 582b827c0ec672b409dd214d43fa83018352c729 (diff)
Wiki2P
-rw-r--r-- Scripts/wiki2P.py 27
1 files changed, 18 insertions, 9 deletions
diff --git a/Scripts/wiki2P.py b/Scripts/wiki2P.py
index f0ded01..84f4d82 100644
--- a/Scripts/wiki2P.py
+++ b/Scripts/wiki2P.py
@@ -1,16 +1,25 @@
-import requests, bs4
+import requests, bs4, re
-#res = requests.get("https://en.wikipedia.org/wiki/Special:Random")
-res = requests.get("https://en.wikipedia.org/wiki/Study")
+# res = requests.get("https://en.wikipedia.org/wiki/Special:Random")
+res = requests.get("https://en.wikipedia.org/wiki/Linux")
-soup = bs4.BeautifulSoup(res.text)
+soup = bs4.BeautifulSoup(res.text, "html.parser")
-element = soup.select("p > a")
+element = soup.select("#mw-content-text a[title]")
+
+
+pattern = re.compile("^\/.*")
+
+
+for i in element:
+    if "Edit section" not in i["title"] and pattern.match(i["href"]):
+        if "div" not in str(i.parent) and "th" not in str(i.parent) and "td" not in str(i.parent):
+            try:
+                i["class"]
+            except KeyError:
+                print(i)
+                break
-print(element[0])
-#while( soup.select(".firstHeading")[0] != "Philosophy"):
-#
-# print(soup.select(".firstHeading")[0].text)