1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
| # using set theory to select words
# lengthy words of more than 15 characters (the filter below is len(w) > 15)
# {w | w ∈ V & P(w)}
# [w for w in V if P(w)]
# the set of all w such that w is an element of V (vocab) and has property P
# get set of vocab in text1
In [10]: V = set(text1)
# iterate through V, grabbing each word with character length greater than 15
In [11]: long_words = [w for w in V if len(w) > 15]
# display the first 10 lengthy words in sorted order
In [12]: sorted(long_words)[:10]
Out[12]:
[u'CIRCUMNAVIGATION',
u'Physiognomically',
u'apprehensiveness',
u'cannibalistically',
u'characteristically',
u'circumnavigating',
u'circumnavigation',
u'circumnavigations',
u'comprehensiveness',
u'hermaphroditical']
#####
# looking at internet long word patterns
# more than 15 characters
# check which text number is the internet chat corpus: it is text5 (Chat Corpus)
In [13]: texts()
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
# create unique vocab set
In [14]: vocab = set(text5)
# iterate through vocab for words greater than 15 characters in length
In [15]: long_chat_words = [word for word in vocab if len(word) > 15]
# display first 10 sorted
In [16]: sorted(long_chat_words)[:10]
Out[16]:
[u'!!!!!!!!!!!!!!!!',
u'!!!!!!!!!!!!!!!!!!!!!!',
u'!!!!!!!!!!!!!!!!!!!!!!!',
u'!!!!!!!!!!!!!!!!!!!!!!!!!!!',
u'!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
u'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
u'#talkcity_adults',
u'(((((((((((((((((',
u'((((((((((((((((((',
u'((((((((((((((((((((']
# display the 101st to 111th sorted words (slice [100:111]); no results, since the list has 100 or fewer entries
In [17]: sorted(long_chat_words)[100:111]
Out[17]: []
# use a negative slice index to take the last 10
# observe exaggerated chat patterns
In [18]: sorted(long_chat_words)[-10:]
Out[18]:
[u'oooooooooooooonnnnnnnnnnnneeeeeeeeeeeeeeesssssssss',
u'raaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
u'tatatatnanaantatat',
u'weeeeeeeeeeeeeeee',
u'weeeeeeeeeeeeeeeeeeeeeeeeed',
u'wheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee',
u'woooooooooaaaahhhhhhhhhhhh',
u'wooooooooooooohoooooooooooooooo',
u'www.Wunderground.com',
u'yuuuuuuuuuuuummmmmmmmmmmm']
|