|  1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80 | # using set theory to select words
# lengthy words of 15 characters or more
# {w | w E V & P(w)}
# [w for w in V if P(w)]
# the set of all w such that w is an element of V (vocab) and has property P
# get set of vocab in text1
In [10]: V = set(text1)
# iterate through V, grabbing each word with character length greater than 15
In [11]: long_words = [w for w in V if len(w) > 15]
# display sorted first 10 lengthy words
In [12]: sorted(long_words)[:10]
Out[12]:
[u'CIRCUMNAVIGATION',
 u'Physiognomically',
 u'apprehensiveness',
 u'cannibalistically',
 u'characteristically',
 u'circumnavigating',
 u'circumnavigation',
 u'circumnavigations',
 u'comprehensiveness',
 u'hermaphroditical']
#####
# looking at internet long word patterns
# more than 15 characters
# check which text number is internet chat, #5 
In [13]: texts()
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
# create unique vocab set
In [14]: vocab = set(text5)
# iterate through vocab for words greater than 15 characters in length
In [15]: long_chat_words = [word for word in vocab if len(word) > 15]
# display first 10 sorted
In [16]: sorted(long_chat_words)[:10]
Out[16]:
[u'!!!!!!!!!!!!!!!!',
 u'!!!!!!!!!!!!!!!!!!!!!!',
 u'!!!!!!!!!!!!!!!!!!!!!!!',
 u'!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 u'!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 u'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 u'#talkcity_adults',
 u'(((((((((((((((((',
 u'((((((((((((((((((',
 u'((((((((((((((((((((']
# display 101st to 110th sorted, no results 
In [17]: sorted(long_chat_words)[100:111]
Out[17]: []
# index from last for last 10
# observe exaggerated chat patterns
In [18]: sorted(long_chat_words)[-10:]
Out[18]:
[u'oooooooooooooonnnnnnnnnnnneeeeeeeeeeeeeeesssssssss',
 u'raaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 u'tatatatnanaantatat',
 u'weeeeeeeeeeeeeeee',
 u'weeeeeeeeeeeeeeeeeeeeeeeeed',
 u'wheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee',
 u'woooooooooaaaahhhhhhhhhhhh',
 u'wooooooooooooohoooooooooooooooo',
 u'www.Wunderground.com',
 u'yuuuuuuuuuuuummmmmmmmmmmm']
 |