# add missing packages to the kernel
import sys
!{sys.executable} -m pip install wiki_dump_reader
Collecting wiki_dump_reader
  Downloading wiki-dump-reader-0.0.4.tar.gz (3.4 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wiki_dump_reader
  Building wheel for wiki_dump_reader (setup.py) ... done
  Created wheel for wiki_dump_reader: filename=wiki_dump_reader-0.0.4-py3-none-any.whl size=4000 sha256=5fd0c0f1e7ab187de9e945945db0ecc0709a06413a5e07d66f6136b1c90ba859
  Stored in directory: /Users/alialvarez/Library/Caches/pip/wheels/72/7b/e1/2b758cc4aa080655ef16ed4f759232491118494fe54ce003d2
Successfully built wiki_dump_reader
Installing collected packages: wiki_dump_reader
Successfully installed wiki_dump_reader-0.0.4
import pandas as pd
import numpy as np
import os
# target file locations
url = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
filename = os.path.join(os.getcwd(), 'enwiki-latest-pages-articles.xml.bz2')
location = os.getcwd()
logfile = os.path.join(os.getcwd(), "log.txt")
# truncate the log file
with open(logfile, "w"):
    pass
import subprocess

def runcmd(cmd, verbose=False, *args, **kwargs):
    """Run a shell command and optionally print its stdout/stderr."""
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        shell=True,
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
# remove any previous download, then fetch the dump
try:
    os.remove(filename)
except OSError:
    pass
runcmd("wget " + url, verbose=True)
# create backup of bz2 file
try:
    os.remove(filename + '.backup')
except OSError:
    pass
runcmd("cp enwiki-latest-pages-articles.xml.bz2 enwiki-latest-pages-articles.xml.bz2.backup", verbose=True)
# decompress the archive with bzip2
runcmd("bzip2 -d " + filename, verbose=True)
Extracting and cleaning enwiki-latest-pages-articles.xml.bz2 to enwiki-latest-pages-articles.txt...
Traceback (most recent call last):
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 645, in <module>
    main()
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 640, in main
    process_dump(input_file, args.templates, output_path, file_size,
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 336, in process_dump
    templates = load_templates(input, template_file)
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/site-packages/wikiextractor/WikiExtractor.py", line 209, in load_templates
    for line in file:
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/bz2.py", line 195, in read1
    return self._buffer.read1(size)
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/Users/alialvarez/opt/anaconda3/lib/python3.8/_compression.py", line 99, in read
    raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
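The EOFError above means the compressed stream was truncated, which typically indicates an incomplete download. A quick way to catch this before extracting is to test the archive's integrity first; a minimal sketch using the runcmd helper defined above:

```python
# Test archive integrity without writing any output; bzip2 -t reads the
# whole stream and reports an error if the end-of-stream marker is missing.
runcmd("bzip2 -t " + filename, verbose=True)
```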
# create backup of xml file
try:
    os.remove(filename[:-4] + '.backup')
except OSError:
    pass
runcmd("cp enwiki-latest-pages-articles.xml enwiki-latest-pages-articles.xml.backup", verbose=True)
# remove any previous csv output
try:
    os.remove(filename[:-8] + '.csv')
except OSError:
    pass
import wiki_dump_parser as parser
parser.xml_to_csv('enwiki-latest-pages-articles.xml')
Processing...
The following line has incomplete info and therefore it's been removed from the dataset: ['419020', '|Lăutari|', '0', '1083044321', '2022-04-16T17:29:47Z', '', '', '22291']
The following line has incomplete info and therefore it's been removed from the dataset: ['434551', '|TQ|', '0', '1054704638', '2021-11-11T17:04:12Z', '', '', '734']
The following line has incomplete info and therefore it's been removed from the dataset: ['560934', '|Glenmont station|', '0', '1083409549', '2022-04-18T17:23:03Z', '', '', '18431']
[... dozens of similar "incomplete info" messages omitted ...]
The following line has incomplete info and therefore it's been removed from the dataset: ['70552409', '|Draft:Meena caste category|', '118', '1083110833', '2022-04-17T03:00:24Z', '', '', '4168']
Done processing
True
df = pd.read_csv('enwiki-latest-pages-articles_metadata_example.csv')
df.head(10)
/Users/alialvarez/opt/anaconda3/envs/wikipedia/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (9,10,11,12,13,14,15,16) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
| | page_id | page_title | page_ns | revision_id | timestamp | contributor_id | contributor_name | bytes | COLUMN_1 | COLUMN_2 | COLUMN_3 | COLUMN_4 | COLUMN_5 | COLUMN_6 | COLUMN_7 | COLUMN_8 | COLUMN_9 | COLUMN_10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | \|AccessibleComputing\| | 0 | 1002250816 | 2021-01-23T15:15:01Z | 20842734 | \|Elli\| | 111 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 12 | \|Anarchism\| | 0 | 1085838220 | 2022-05-02T18:53:41Z | 31382403 | \|BappleBusiness\| | 105122 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 13 | \|AfghanistanHistory\| | 0 | 783865149 | 2017-06-05T04:18:18Z | 9784415 | \|Tom.Reding\| | 90 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 14 | \|AfghanistanGeography\| | 0 | 783865160 | 2017-06-05T04:18:23Z | 9784415 | \|Tom.Reding\| | 92 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 15 | \|AfghanistanPeople\| | 0 | 783865293 | 2017-06-05T04:19:42Z | 9784415 | \|Tom.Reding\| | 95 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 18 | \|AfghanistanCommunications\| | 0 | 783865299 | 2017-06-05T04:19:45Z | 9784415 | \|Tom.Reding\| | 97 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 19 | \|AfghanistanTransportations\| | 0 | 783821589 | 2017-06-04T21:42:11Z | 9784415 | \|Tom.Reding\| | 113 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 7 | 20 | \|AfghanistanMilitary\| | 0 | 1071065800 | 2022-02-10T17:56:28Z | 8066546 | \|Xqbot\| | 92 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | 21 | \|AfghanistanTransnationalIssues\| | 0 | 783821743 | 2017-06-04T21:43:14Z | 9784415 | \|Tom.Reding\| | 101 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | 23 | \|AssistiveTechnology\| | 0 | 783865310 | 2017-06-05T04:19:50Z | 9784415 | \|Tom.Reding\| | 88 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
As the example above shows, wiki_dump_parser only retrieves basic page metadata (such as page_id) and does not render the markup. It also mishandles some lines, spilling their values into ten extra columns (COLUMN_1 through COLUMN_10), which is what triggers the mixed-type warning above.
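A minimal cleanup sketch for that output, assuming the column names shown in the table above: re-read with low_memory=False as the DtypeWarning suggests, drop the spill-over columns, and strip the surrounding pipes from the text fields.

```python
# Re-read with low_memory=False to silence the DtypeWarning.
df = pd.read_csv('enwiki-latest-pages-articles_metadata_example.csv', low_memory=False)

# Drop the spill-over columns created by the mis-parsed lines.
spill_cols = [c for c in df.columns if c.startswith('COLUMN_')]
df = df.drop(columns=spill_cols)

# Strip the surrounding pipes from the string fields.
for col in ('page_title', 'contributor_name'):
    df[col] = df[col].str.strip('|')
```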
from wiki_dump_reader import Cleaner, iterate

wiki = {}
cleaner = Cleaner()
for title, text in iterate('enwiki-latest-pages-articles.xml'):
    orig_text = text  # keep a copy of the raw markup
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    # add the page to the dictionary, keyed by title
    wiki.update({title: [cleaned_text]})
df_wiki = pd.DataFrame.from_dict(wiki, orient='index')
df_wiki.columns = ['cleaned']
df_wiki.to_csv(os.path.join(os.getcwd(), 'wiki_dump_example.csv'))
df_wiki.head(5)
| | cleaned |
|---|---|
| AccessibleComputing | REDIRECT Computer accessibility |
| Anarchism | Anarchism is a political philosophy and moveme... |
| AfghanistanHistory | REDIRECT History of Afghanistan |
| AfghanistanGeography | REDIRECT Geography of Afghanistan |
| AfghanistanPeople | REDIRECT Demographics of Afghanistan |
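Since df_wiki is indexed by page title, a single page's cleaned text can be pulled straight from the frame; a small usage sketch:

```python
# Fetch the cleaned text for one page by its title (the DataFrame index).
print(df_wiki.loc['Anarchism', 'cleaned'][:200])
```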
runcmd("./extract_and_clean_wiki_dump.sh", verbose = True)
import wikiextractor
# our target subject:
subject = 'ted talk speakers'
# run a search on the subject:
import wikipedia
search_result = wikipedia.search(subject)
print('the search result is:')
print(search_result)
result = search_result[1]  # pick the second hit from the results list
print('')
print('the selected page is:', result)
# build a URL based on the search result
url = 'https://en.wikipedia.org/wiki/' + result.replace(" ", "_")
print(url)
# fetch the page object for the selected result
result = wikipedia.page(result)
# attributes the library exposes
title = result.title
summary = result.summary
categories = result.categories
content = result.content
links = result.links
references = result.references
# print info
print("Page content:\n", content, "\n")
print("Page title:", title, "\n")
print("Categories:", categories, "\n")
print("Links:", links, "\n")
print("References:", references, "\n")
print("Summary:", summary, "\n")
print("Page title:", title, "\n")
html = result.html()
html
html = pd.read_html(url)
print(type(html))
print(html)
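pd.read_html returns a list of DataFrames, one per table element on the page. Which entry holds the speakers table is page-dependent, so the index in the sketch below is an assumption to verify by inspection:

```python
tables = pd.read_html(url)
print(len(tables))       # how many tables the page contains
speakers = tables[0]     # assumption: the speakers list is the first table
speakers.head()
```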
# using BeautifulSoup to dig into the html
from bs4 import BeautifulSoup
from urllib.request import urlopen

html = urlopen(url)
bsObj = BeautifulSoup(html.read(), 'html.parser')
print(bsObj)
def tag_list(tag):
    """Return the text of every element matching `tag`, using BeautifulSoup's find_all."""
    soup = bsObj
    a = []
    content = soup.find_all(tag)
    for item in content:
        a.append(item.get_text())
    return a
authors = tag_list("td")[6:]  # drop the first 6 cells, which are not authors or talks
authors
import re

# parse the HTML for speaker entries; the commented-out patterns targeted
# hrefs, the final regex grabs the contents of each <td> cell
txt = str(bsObj)
result = []
# reg = '(?<=a href=)(.*)(?=title)'
# reg = '(?<=data-sort-value=)(.*)(?=title)'
reg = '(?<=td>)(.*)(?=</td>)'
reobj = re.compile(reg)
for matchobj in reobj.finditer(txt):
    result.append(matchobj[1])
for line in result:
    print(line)
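Regex over raw HTML is fragile; a sketch of the same extraction done with BeautifulSoup instead, pulling the first link out of each table cell (prefixing the domain is an assumption that the hrefs are relative /wiki/ paths):

```python
# Collect the first link in each table cell; Wikipedia hrefs are relative,
# so prefix the domain to get absolute speaker URLs.
speaker_links = []
for td in bsObj.find_all('td'):
    a = td.find('a', href=True)
    if a is not None:
        speaker_links.append('https://en.wikipedia.org' + a['href'])
print(speaker_links[:10])
```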
## Using Scrapy Library
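This section was left as a stub; below is a minimal sketch of how the same table could be fetched with Scrapy. The spider name and CSS selectors are assumptions rather than taken from the original, and the spider reuses the url built earlier in the notebook.

```python
import scrapy
from scrapy.crawler import CrawlerProcess

class SpeakerSpider(scrapy.Spider):
    """Hypothetical spider: crawls the page found above and yields its table cells."""
    name = 'speaker_spider'  # assumed name
    start_urls = [url]       # reuse the URL built earlier in the notebook

    def parse(self, response):
        # 'table.wikitable tr' is an assumption about the page's markup
        for row in response.css('table.wikitable tr'):
            cells = row.css('td::text').getall()
            if cells:
                yield {'cells': cells}

# Run the spider in-process. Note: in a notebook, the Twisted reactor can
# only be started once per kernel session.
process = CrawlerProcess(settings={'LOG_ENABLED': False})
process.crawl(SpeakerSpider)
process.start()
```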
Sources:
- Wiki-Dump-Reader Package
- How-to Article
- Use Markup Library
- extractor
- Example of cleaning dump