Download our e-book: Introduction to Python
Kajal Pawar
3 years ago
#import required libraries
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
import lxml.etree
# Download the zip archive; the relative filename means it is saved in the
# current working directory.
urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

# Parse the XML member stored inside the archive. Nested context managers
# close both the member stream and the archive once parsing has finished.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_member:
        doc = lxml.etree.parse(xml_member)

# Join the text nodes of every <content> element into one string, separated
# by newlines.
input_data = '\n'.join(doc.xpath('//content/text()'))
# Delete every parenthesised span: an opening "(" up to the next ")".
input_text_noparens = re.sub(
    pattern=r'\([^)]*\)',
    repl='',
    string=input_data,
)
# Store the text as a flat list of sentence strings.
# A line may begin with a short "label:" prefix (at most 20 characters, none
# of them a colon). Only the text after that prefix is kept, and it is then
# split into sentences on full stops.
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    # Named groups need the (?P<name>...) form. Only "postcolon" is read
    # below; the name "precolon" is chosen to mirror it.
    # The prefix group is optional and "postcolon" is ".*", so the pattern
    # matches any single line and m is never None here.
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    # Drop empty fragments (e.g. from consecutive or trailing full stops).
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
# Store the sentences as a list of lists of words.
clean_sentences_ted = []
for sent_str in sentences_strings_ted:
    # Lower-case the sentence, turn every run of characters outside a-z / 0-9
    # into a single space, then split on whitespace to get the tokens.
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    clean_sentences_ted.append(tokens)
[('arthritis', 0.7805954217910767),
('h1n1', 0.7803696990013123),
('cdc', 0.7635983228683472),
('amygdala', 0.7610822319984436),
('penitentiary', 0.7592360973358154),
('tuscaloosa', 0.7576302289962769),
('inflammatory', 0.7538164854049683),
('amyloid', 0.7519340515136719),
('cesarean', 0.7514520883560181),
('aromatase', 0.7490667104721069)]