World's Best AI Learning Platform, with Highly Demanding Certification Programs
Designed by IITians, only for AI Learners.
Designed by IITians, only for AI Learners.
New to InsideAIML? Create an account
Employer? Create an account
Download our e-book, "Introduction to Python"
4.5 (1,292 Ratings)
559 Learners
Kajal Pawar
a year ago
#import required libraries
import numpy as np
import os
from random import shuffle
import re
import urllib.request
import zipfile
import lxml.etree
# Download the data: fetch the ted_en-20160408.zip archive into the current
# working directory. NOTE: this needs network access and re-fetches the
# archive on every run.
urllib.request.urlretrieve(
    "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip",
    filename="ted_en-20160408.zip",
)

# Extract subtitles: parse the XML member directly from the archive (nothing
# is unpacked to disk). Both the archive and the member handle are closed as
# soon as parsing finishes.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

# Join the text nodes of every <content> element into one newline-separated
# string.
input_data = '\n'.join(doc.xpath('//content/text()'))
# Remove parenthesized text: every "( ... )" span is deleted.
input_text_noparens = re.sub(r'\([^)]*\)', '', input_data)

# Store as a list of sentences.
# A line may start with a short "label:" prefix (at most 20 non-colon
# characters followed by a colon); only the text after it is kept.
# The group names are required: without <precolon>/<postcolon> the pattern
# does not compile and groupdict()['postcolon'] below has nothing to look up.
# Compiled once here so the loop does not repeat the pattern lookup per line.
line_pattern = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    # Every component of the pattern is optional, so a line produced by
    # split('\n') always yields a match object.
    m = line_pattern.match(line)
    # Split on '.' and drop the empty pieces.
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# Store as a list of lists of words: lowercase each sentence, collapse every
# run of characters outside a-z / 0-9 into a single space, then split on
# whitespace.
clean_sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    clean_sentences_ted.append(tokens)
[('arthritis', 0.7805954217910767),
('h1n1', 0.7803696990013123),
('cdc', 0.7635983228683472),
('amygdala', 0.7610822319984436),
('penitentiary', 0.7592360973358154),
('tuscaloosa', 0.7576302289962769),
('inflammatory', 0.7538164854049683),
('amyloid', 0.7519340515136719),
('cesarean', 0.7514520883560181),
('aromatase', 0.7490667104721069)]