Python Practices

1. Simple Math and Print

Note: if all the operands are integers, Python 2 performs integer division and drops the decimal part; make at least one operand a float (here b = 3.) to get a decimal result. In Python 3, / always returns a float.

In [ ]:
a = 2
b = 3.
c = b/a
print(c)
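
For contrast, a minimal sketch of the two division operators in Python 3:

In [ ]:
a = 2
b = 3
print(b / a)         # 1.5 -- true division always returns a float in Python 3
print(b // a)        # 1   -- floor division keeps integers
print(float(b) / a)  # 1.5 -- explicit coercion, the Python 2 workaround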

2. String: Split, Join, and Count

In [ ]:
text = "The quick brown fox jumped over the lazy log."
mytext = text.split()
print(mytext)
print(" ".join(mytext))
print(text.find("fox"))

count = 0
for x in mytext:
    if "e" in x:
        count += 1
        
print("'e' is repeated",count, "times")

3. User Input as Integer

The user input is taken as "b" and compared to the variable "a".

In [ ]:
a = 5
b = int(input("Give me a number:"))
if b == a:
    print("Yes, right")
else:
    print("No, No..!")

4. Open and Read Files

First create a file named "ques.txt" and enter some questions in it, then read that file in your Python code. Treat it as an interview, so the time needs to be recorded as well. Ask for the person's name first and save the answers to a file under that name.

In [ ]:
import datetime

username = input("What is your name? ")
# "w" creates the file, or truncates one left over from an earlier run;
# the original open("a")/close()/open("r+") sequence could leave stale text behind
with open('ques.txt', 'r') as questions, open(username + ".txt", "w") as output:
    for line in questions:
        the_reply = input(line)
        output.write(line + "{0}: {1}\n".format(username, the_reply))
    output.write("\nInterview Time: " + str(datetime.datetime.now().time()))

5. Read a File

Read the file that you created line by line and print the answers from the interviewee.

In [ ]:
file = open("User.txt", "r")
mylines = file.readlines()

for i, line in enumerate(mylines):
    # enumerate gives the position directly; list.index() would return the
    # first match and misbehave if a line repeats
    if "How old are you?" in line:
        print(mylines[i + 1])
    if "What is your job?" in line:
        print(mylines[i + 1])
    if "Where are you from?" in line:
        print(mylines[i + 1])

6. Adding Answers to Dictionary

Create an empty dictionary and add the questions and answers to it.

In [ ]:
file = open("User.txt", "r")
mylines = file.readlines()
mydict = {}
for i, line in enumerate(mylines):
    # as before, enumerate avoids list.index() and its duplicate-line pitfall
    if "How old are you?" in line:
        mydict[mylines[i + 1]] = i
    if "What is your job?" in line:
        mydict[mylines[i + 1]] = i
    if "Where are you from?" in line:
        mydict[mylines[i + 1]] = i

print(mydict)

7. Sort the Dictionary

How to sort a dict by its values.

In [ ]:
file = open("User.txt", "r")
mylines = file.readlines()

mydict = {}
for i, line in enumerate(mylines):
    my_line = line.replace("\n", "")
    mydict[my_line] = i


print(sorted(mydict.items(), key=lambda x: x[1]))

8. How to Run a Flask Webserver

A very simple web page with Flask, served on port 5000. Name the file "hello.py" and then run it with the following commands:

$ export FLASK_APP=hello.py
$ flask run

(public access: flask run --host=0.0.0.0)

A quickstart can be found here: http://flask.pocoo.org/docs/0.12/quickstart/

In [ ]:
from flask import Flask
from flask import request

app = Flask(__name__)

@app.route('/')
def index():
    return 'Welcome'
    
@app.route('/login')
def login():
    username = request.args.get('username')
    password = request.args.get('password')
    return '%s is logged in' % username

@app.route('/user/<username>')
def show_user_profile(username):
    # show the user profile for that user
    return '<html><body>This is a test to see if <span style="font-weight:bold">%s</span> is shown in middle of html text</body></html>' % username

@app.route('/post/<int:post_id>')
def show_post(post_id):
    # show the post with the given id, the id is an integer
    return 'Post %d' % post_id

@app.route('/test')
def test():
    return 'this is a test'


if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)
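
Once the server is running, the routes can be tried from a second terminal, for example with curl (the username/password values below are just placeholders):

$ curl http://127.0.0.1:5000/
$ curl "http://127.0.0.1:5000/login?username=alice&password=secret"
$ curl http://127.0.0.1:5000/user/alice
$ curl http://127.0.0.1:5000/post/42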

9. Command Line Arguments

To run the code, type in the command line:

$ python test.py arg1 arg2 arg3

in which the arguments can be, for example, "user1.txt", "user2.txt", etc.

In [ ]:
#!/usr/bin/python
import sys
 
# Get the total number of args passed to the demo.py
total = len(sys.argv)
 
# Get the arguments list 
cmdargs = str(sys.argv)
 
# Print it
print ("The total numbers of args passed to the script: %d " % total)
print ("Args list: %s " % cmdargs)
# skip sys.argv[0], which is the script name itself
for x in sys.argv[1:]:
    print(x, end=" ")
    with open(x, "r") as file:
        for line in file:
            print(line, end=" ")
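
For named, self-documenting options the standard library's argparse is the usual upgrade over raw sys.argv; a minimal sketch, run as a script (the argument name "files" is just an example):

In [ ]:
import argparse

parser = argparse.ArgumentParser(description="Print the contents of text files.")
parser.add_argument("files", nargs="+", help="one or more text files to read")
args = parser.parse_args()

for name in args.files:
    with open(name, "r") as f:
        print(f.read())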

10. Making Dictionary from a Text File

The NLTK tokenizer is used to separate the sentences. The structure is roughly what I need; it still needs to be completed!

In [ ]:
import glob
from nltk.tokenize import sent_tokenize

my_files = glob.glob('Arsam.txt')
mylist = []
for i in my_files:
    file = open(i, 'r')
    data = file.read()
    sent_tokenize_list = sent_tokenize(data)
    count = len(sent_tokenize_list)
    mylist.append(i)
    for x in range(count):
        myformat = (sent_tokenize_list[x],'MX')
        mylist.append(myformat)

    print(mylist)

11. A bit of Node.Js (Sorry Python)

Save the code as "server.js" and run it with:

$ node server.js
In [ ]:
var express = require('express');
var app = express();

app.get('/', function (req, res) {
   res.send('Hello World');
})

var server = app.listen(8081, function () {
   var host = server.address().address
   var port = server.address().port
   
   console.log("Example app listening at http://%s:%s", host, port)
})

12. Using BeautifulSoup

In [ ]:
# urllib2 is Python 2 only; urllib.request is the Python 3 equivalent
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://aclanthology.info/").read()
soup = BeautifulSoup(html, 'html.parser')
# <bib> is not an HTML tag; iterate over the <a> links instead
for link in soup.find_all('a'):
    print(link.get('href'))

13. Idiomatic Python

Some practices for looping through data in lists.

In [ ]:
z = [ 'a', 'b', 'c', 'd' ]
i = 0
while i < len(z):
    print(i, z[i])
    i += 1
    
#adds a line between results
print()

for i in range(0, len(z)):
    print(i, z[i])

#adds a line between results
print()    


for i, item in enumerate(z):
    print(i, item)

#adds a line between results
print()
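
And when two lists have to be walked in parallel, zip() is the idiomatic tool (a small sketch with made-up data):

In [ ]:
z = ['a', 'b', 'c', 'd']
nums = [1, 2, 3, 4]
for letter, num in zip(z, nums):
    print(letter, num)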

14. Fun with dictionary

In [1]:
e = dict(a=5, b=4, c=18)
print(e)
{'a': 5, 'b': 4, 'c': 18}

and more

In [2]:
d = {}
d['a'] = 5
d['b'] = 4
d['c'] = 18
print(d)
{'a': 5, 'b': 4, 'c': 18}

and even more

In [3]:
items = d.items()
print(items)
dict_items([('a', 5), ('b', 4), ('c', 18)])

and get them sorted

In [4]:
# sorted() returns a new list rather than sorting in place,
# so the result has to be captured
items = sorted(items)
print(items)
[('a', 5), ('b', 4), ('c', 18)]

15. Calculating "Average" and "Standard Division" for a list

In [ ]:
import math
data = [ 1, 2, 3, 4, 5 ]
average = sum(data) / float(len(data))
stddev = sum([ (x - average)**2 for x in data ]) / float(len(data))
stddev = math.sqrt(stddev)
print(average, '+/-', stddev)
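
Since Python 3.4 the statistics module covers this directly; pstdev is the population standard deviation computed above (stdev would be the sample version):

In [ ]:
import statistics
data = [1, 2, 3, 4, 5]
print(statistics.mean(data), '+/-', statistics.pstdev(data))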

16. NLTK Corpus Add Categories

In [ ]:
import nltk
from nltk.corpus import *
corpus_root = '../Downloads/soft'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
reader = CategorizedPlaintextCorpusReader('', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())

17. NLTK Stopwords Check

In [ ]:
import nltk

def find_stopwords(text):
    # renamed from content_fraction: this version returns the stopwords
    # it finds, not a fraction; the local name also no longer shadows
    # the nltk.corpus.stopwords import
    stoplist = nltk.corpus.stopwords.words('english')
    return [w for w in text.split() if w in stoplist]

print(find_stopwords('This is a test for the stopwords.'))
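
For comparison, the NLTK book's content_fraction returns the share of content (non-stop) words; a sketch of that variant:

In [ ]:
import nltk

def content_fraction(text):
    # text is a list of tokens
    stoplist = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stoplist]
    return len(content) / len(text)

print(content_fraction('This is a test for the stopwords.'.split()))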

18. Encoding in Python File

Start the file with # -*- coding: utf-8 -*-

In [ ]:
# -*- coding: utf-8 -*-
s = "این یک تست است"
print(s.split())

19. Counting the words in a directory

In [ ]:
# os is needed for os.listdir() below
import os
from nltk import *

counter = []
def wordcounter(file):
    path = '../Downloads/soft/news/'+file
    f = open(path, 'r')
    mylist = []
    for word in f:
        sublist = word_tokenize(word)
        mylist.append(sublist)
    words = []
    for item in mylist:
        for i in item:
            # skip tokenizer quote artifacts and punctuation
            if i not in ("``", ",", ".", "''"):
                words.append(i)
    counter.append(len(words))
    return len(words)

directory = os.listdir('../Downloads/soft/news')
for file in directory:
    print(file, ":", wordcounter(file), "words")
    
print("\nDirectory contains", len(counter), "files, with a total number of", sum(counter), "words.")

20. Regular Expression: Optional "-"

In [ ]:
import re
s = "This is an email address.".split()
wordlist = [w for w in s  if re.search('^e-?mail$', w)]
print(wordlist)

21. Finding and counting words in custom corpus

In [ ]:
import nltk
from nltk.corpus import *
import re
corpus_root = '/home/dariush/Downloads/soft'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
corpus = CategorizedPlaintextCorpusReader('', r'.*\.txt', cat_pattern=r'(\w+)/*')
#print(corpus.categories())
words = corpus.words(categories=['news','general'])
found_words = [w for w in words for x in re.findall(r'stud.', w)]
#print(sorted(found_words))
print(nltk.FreqDist(found_words).most_common(100))
fdist = nltk.FreqDist(sorted(found_words))
print(fdist.N())
fdist.tabulate()
fdist.plot()
for w in fdist:
    print(w, ":", fdist[w])

22. Concordance in custom corpus

In [ ]:
import nltk
from nltk.corpus import *
import re
corpus_root = '/home/dariush/Downloads/soft'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
corpus = CategorizedPlaintextCorpusReader('', r'.*\.txt', cat_pattern=r'(\w+)/*')
#print(corpus.categories())
words = corpus.words(categories=['news','general'])
found_words = [w for w in words for x in re.findall(r'Mal', w)]
#print(sorted(found_words))
#print(nltk.FreqDist(found_words).most_common(100))
fdist = nltk.FreqDist(sorted(found_words))
#print(fdist.N())
fdist.tabulate()
fdist.plot()

text = nltk.Text(words)
for w in fdist:
    text.concordance(w)

23. Word Stem with NLTK

In [1]:
import nltk
from nltk.corpus import *
import re
raw = "I tested the tests for the teachers"
regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
tokens = nltk.word_tokenize(raw)
stems = []
for w in tokens:
    stem = re.findall(regexp, w)
    stems.append(stem[0])

for item in stems:
    # only print the words that actually had a suffix stripped
    if item[1] != '':
        print(item)
('test', 'ed')
('test', 's')
('teacher', 's')

24. Simple Segmentation with Zero One Pattern

In [2]:
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words

sent = "Researchonthe"
seg2 = "0000000101001"
print(segment(sent, seg2))
['Research', 'on', 'the', '']

25. Finding longest word in a text

In [ ]:
text = 'This is a sample text with some sentences and we are going to find the longest word in this text' 
words = text.split()
maxlen = max(len(t) for t in words)
longest = [w for w in words if len(w) == maxlen]
print(longest, maxlen, 'characters')
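
If any single longest word is enough, max() with a key function does it in one line (reusing words from the cell above; on ties it returns the first match):

In [ ]:
print(max(words, key=len))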

26. Lambda usage: calculator in 2 lines

In [1]:
calculator = lambda x : print(eval(x))
calculator(input("Enter a math expression: "))
Enter a math expression: 34/45
0.7555555555555555
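
Note that eval() executes arbitrary Python, so it should only ever see trusted input. A hedged alternative that restricts the input to a single binary operation via the operator module:

In [ ]:
import operator

ops = {'+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv}

def calculate(expr):
    # expects the form "<number> <op> <number>", e.g. "34 / 45"
    left, op, right = expr.split()
    return ops[op](float(left), float(right))

print(calculate("34 / 45"))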

27. NLTK Str2tuple: Constructing tagged corpus

In [2]:
import nltk

sent = 'The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN'
tagged_sent = [nltk.tag.str2tuple(w) for w in sent.split()]
print(tagged_sent)
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'), ('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ('of', 'IN')]

28. Finding the most common Tags in a custom corpus

In [ ]:
import nltk
from nltk.corpus import *
import re

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('',r'.*\.txt',cat_pattern=r'(\w+)/*')
sents = corpus.sents(categories=['news','general'])
tagged_sent = [nltk.pos_tag(sent) for sent in sents]
# flatten all (word, tag) pairs and keep every tag, not just one per sentence
tag_list = [tag for sent in tagged_sent for (word, tag) in sent]
fd = nltk.FreqDist(tag_list)
print(fd.most_common())
29. Finding the tags preceding 'VBP' in a custom corpus

In [ ]:
import nltk

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('',r'.*\.txt',cat_pattern=r'(\w+)/*')
sents = corpus.sents(categories=['news','general'])
word_tag_pairs = [nltk.pos_tag(sent) for sent in sents]
all_tags = [w for w in word_tag_pairs]
precedings = []
for w in all_tags:
    for a in w:
        if a[1] == 'VBP':
            current = w.index(a)
            preceding = w[current-1]
            precedings.append(preceding)
            
fdist = nltk.FreqDist([w[1] for w in precedings])
print(fdist.most_common())
cfd1 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in precedings)
print([w for w in cfd1['RB'].most_common()])

30. Search Bigrams in corpus + POS statistics

In [ ]:
import nltk
from nltk.corpus import *


corpus = CategorizedPlaintextCorpusReader('',r'.*\.txt', cat_pattern=r'(\w+)/*')
text = corpus.words(categories=['news','general'])
bigrams = nltk.bigrams(text)
keyword = 'which'
# collect the words that immediately follow the keyword
bigram_list = [x[1] for x in bigrams if x[0] == keyword]
print(bigram_list, '\n\n')
tags = nltk.pos_tag(bigram_list)
print(tags, '\n\n')
taglist = [x[1] for x in tags]
fdist = nltk.FreqDist(taglist)
fdist.tabulate()
fdist.plot()

31. Trigram in custom corpus

In [1]:
import nltk

def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            #print(sentence, '\n')
            print(w1, w2, w3)

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('',r'.*\.txt', cat_pattern=r'(\w+)/*')
text = corpus.sents(categories=['news','general'])
for sent in text:
    process(nltk.pos_tag(sent))
Note: if the corpus root matches no .txt files, corpus.sents() fails with "ValueError: concat() expects at least one object!" (as happened in this run); make sure the categorized files actually exist under the given root.

32. Making list from list of lists

In [7]:
all_list = [['item1'], ['item2'], ['item3']]
flat_list = [item for sublist in all_list for item in sublist]
print(flat_list)
['item1', 'item2', 'item3']
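
The standard library offers the same flattening via itertools.chain.from_iterable:

In [ ]:
import itertools
all_list = [['item1'], ['item2'], ['item3']]
print(list(itertools.chain.from_iterable(all_list)))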

33. NLTK Finding Frequency

In [1]:
import nltk
sent = 'this is a a a a a test for the test'
fdist = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sent))
fdist.tabulate()
   a test this   is  for  the 
   5    2    1    1    1    1 

34. Get unique values from a list

In [1]:
tags = ['IN', 'DT', 'JJ', 'NN', 'VBZ', 'VBN', 'VBN', 'DT','IN','NN']
tags = set(tags)
print(tags)
{'JJ', 'IN', 'NN', 'VBN', 'VBZ', 'DT'}
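
A set does not preserve order; when the original order matters, dict.fromkeys (insertion-ordered in Python 3.7+) is the usual trick:

In [ ]:
tags = ['IN', 'DT', 'JJ', 'NN', 'VBZ', 'VBN', 'VBN', 'DT', 'IN', 'NN']
print(list(dict.fromkeys(tags)))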

35. Import CSV data and Tabulate it

In [38]:
import csv
from tabulate import tabulate
from operator import itemgetter

country = 'Switzerland'
year = '2015'
max_rank = 100

readdata = csv.reader(open("cwurData.csv"))
alldata = []
for data in readdata:
    alldata.append(data)

countrydata = alldata[1:]
record = [(itemgetter(0,1,3)(record)) for record in countrydata if record[2] == country and record[-1] == year and int(record[0]) <= max_rank]
print('\n\n', country, 'TOP UNIVERSITIES (<',max_rank,') (',year,')\n')
print(tabulate(record,headers=(itemgetter(0,1,3)(alldata[0]))))
print('\n\nWORLD TOP', max_rank, 'UNIVERSITIES (',year,')\n')
data = [record[:3] for record in countrydata if int(record[0]) <= max_rank and record[-1] == year]
print(tabulate(data,headers=alldata[0]))

 Switzerland TOP UNIVERSITIES (< 100 ) ( 2015 )

  world_rank  institution                                          national_rank
------------  -------------------------------------------------  ---------------
          20  Swiss Federal Institute of Technology in Zurich                  1
          91  Swiss Federal Institute of Technology in Lausanne                2
          92  University of Geneva                                             3
          96  University of Zurich                                             4


WORLD TOP 100 UNIVERSITIES ( 2015 )

  world_rank  institution                                        country
------------  -------------------------------------------------  --------------
           1  Harvard University                                 USA
           2  Stanford University                                USA
           3  Massachusetts Institute of Technology              USA
           4  University of Cambridge                            United Kingdom
           5  University of Oxford                               United Kingdom
           6  Columbia University                                USA
           7  University of California, Berkeley                 USA
           8  University of Chicago                              USA
           9  Princeton University                               USA
          10  Cornell University                                 USA
          11  Yale University                                    USA
          12  California Institute of Technology                 USA
          13  University of Tokyo                                Japan
          14  University of Pennsylvania                         USA
          15  University of California, Los Angeles              USA
          16  Johns Hopkins University                           USA
          17  Kyoto University                                   Japan
          18  New York University                                USA
          19  University of Michigan, Ann Arbor                  USA
          20  Swiss Federal Institute of Technology in Zurich    Switzerland
          21  University of California, San Diego                USA
          22  Northwestern University                            USA
          23  Hebrew University of Jerusalem                     Israel
          24  Seoul National University                          South Korea
          25  University of Wisconsin–Madison                    USA
          26  University of California, San Francisco            USA
          27  University College London                          United Kingdom
          28  Duke University                                    USA
          29  Rockefeller University                             USA
          30  University of Texas at Austin                      USA
          31  University of Washington - Seattle                 USA
          32  University of Toronto                              Canada
          33  University of Illinois at Urbana–Champaign         USA
          34  Keio University                                    Japan
          35  Imperial College London                            United Kingdom
          36  École Polytechnique                                France
          37  École normale supérieure - Paris                   France
          38  Waseda University                                  Japan
          39  Weizmann Institute of Science                      Israel
          40  University of North Carolina at Chapel Hill        USA
          41  University of Virginia                             USA
          42  McGill University                                  Canada
          43  Purdue University, West Lafayette                  USA
          44  Dartmouth College                                  USA
          45  Osaka University                                   Japan
          46  University of Pittsburgh - Pittsburgh Campus       USA
          47  Pennsylvania State University, University Park     USA
          48  University of Minnesota, Twin Cities               USA
          49  Ohio State University, Columbus                    USA
          50  Rutgers University-New Brunswick                   USA
          51  University of Southern California                  USA
          52  Washington University in St. Louis                 USA
          53  University of California, Davis                    USA
          54  University of Colorado Boulder                     USA
          55  University of Edinburgh                            United Kingdom
          56  Peking University                                  China
          57  National Taiwan University                         Taiwan
          58  University of Paris-Sud                            France
          59  Lomonosov Moscow State University                  Russia
          60  University of Florida                              USA
          61  Carnegie Mellon University                         USA
          62  University of British Columbia                     Canada
          63  Pierre-and-Marie-Curie University                  France
          64  University of California, Santa Barbara            USA
          65  National University of Singapore                   Singapore
          66  Boston University                                  USA
          67  University of Manchester                           United Kingdom
          68  University of Arizona                              USA
          69  Vanderbilt University                              USA
          70  University of Utah                                 USA
          71  Karolinska Institute                               Sweden
          72  University of Maryland, College Park               USA
          73  Tohoku University                                  Japan
          74  University of Copenhagen                           Denmark
          75  University of Texas Southwestern Medical Center    USA
          76  Ruprecht Karl University of Heidelberg             Germany
          77  University of Notre Dame                           USA
          78  Tsinghua University                                China
          79  Ludwig Maximilian University of Munich             Germany
          80  Georgia Institute of Technology                    USA
          81  University of Rochester                            USA
          82  Katholieke Universiteit Leuven                     Belgium
          83  Brown University                                   USA
          84  Emory University                                   USA
          85  Erasmus University Rotterdam                       Netherlands
          86  Tel Aviv University                                Israel
          87  Utrecht University                                 Netherlands
          88  University of Sydney                               Australia
          89  University of California, Irvine                   USA
          90  University of Texas MD Anderson Cancer Center      USA
          91  Swiss Federal Institute of Technology in Lausanne  Switzerland
          92  University of Geneva                               Switzerland
          93  University of Melbourne                            Australia
          94  King's College London                              United Kingdom
          95  Tufts University                                   USA
          96  University of Zurich                               Switzerland
          97  Arizona State University                           USA
          98  Yonsei University                                  South Korea
          99  University of Oslo                                 Norway
         100  Kyushu University                                  Japan
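
Indexing rows by position (record[2], record[-1]) is brittle; csv.DictReader reads each row as a dict keyed by the header instead. A sketch assuming the same cwurData.csv column names:

In [ ]:
import csv

with open("cwurData.csv") as f:
    rows = [row for row in csv.DictReader(f)
            if row['country'] == 'Switzerland'
            and row['year'] == '2015'
            and int(row['world_rank']) <= 100]

for row in rows:
    print(row['world_rank'], row['institution'], row['national_rank'])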

36. Extract and visualize data from CSV file

- Change the country; the number of slices in explode() adapts to the number of results
- Change 'max_rank' to find more results
- itemgetter() from the operator library helps select specific indexes from a list, e.g. item[0], item[3]
- 'STRING'.title() capitalizes the first letter of each word in the string
- Make a list of a repeated string/int with "mylist = ['item'] * 10"
In [2]:
import csv
from tabulate import tabulate
from operator import itemgetter
import matplotlib.pyplot as plt

country = 'United Kingdom'
year = '2015'
max_rank = 100

readdata = csv.reader(open("cwurData.csv"))
alldata = []
for data in readdata:
    alldata.append(data)
  
title = alldata[0]
#print(title)
countrydata = alldata[1:]
record = [(itemgetter(0,1,3,12)(record)) for record in countrydata if record[2] == country and record[-1] == year and int(record[0]) <= max_rank]
print('\n\n', country, 'Top Universities (<',max_rank,') (',year,')\n')
print(tabulate(record,headers=(itemgetter(0,1,3,12)(alldata[0]))))

print('\n\n', country, 'Top Universities Based on',title[12].title())
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = [record[1] for record in countrydata if record[2] == country and record[-1] == year and int(record[0]) <= max_rank]
sizes = [record[12] for record in countrydata if record[2] == country and record[-1] == year and int(record[0]) <= max_rank]
slices = [0] * len(sizes)
explode = (slices)  
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  
plt.show()

 United Kingdom Top Universities (< 100 ) ( 2015 )

  world_rank  institution                  national_rank    score
------------  -------------------------  ---------------  -------
           4  University of Cambridge                  1    96.81
           5  University of Oxford                     2    96.46
          27  University College London                3    62.27
          35  Imperial College London                  4    59.61
          55  University of Edinburgh                  5    54.39
          67  University of Manchester                 6    53.38
          94  King's College London                    7    51.31


 United Kingdom Top Universities Based on Score

37. Automatic Column Extraction from CSV

Change the 'column_name' value to 'citations', 'score', 'world_rank', etc. to see a different table.

In [7]:
import csv
from tabulate import tabulate
from operator import itemgetter
import matplotlib.pyplot as plt

country = 'Germany'
year = '2015'
max_rank = 100
column_name = 'score'

readdata = csv.reader(open("cwurData.csv"))
alldata = []
for data in readdata:
    alldata.append(data)
  
column_title = dict([(v,k) for k, v in enumerate(alldata[0])])
column = column_title[column_name]

countrydata = alldata[1:]
record = [(data[0],data[1],data[3],data[column]) for data in countrydata if data[2] == country and data[-1] == year and int(data[0]) <= max_rank]
print('\n\n', country, 'Top Universities (<',max_rank,') (',year,')\n')
print(tabulate(record,headers=(itemgetter(0,1,3,column)(alldata[0]))))

 Germany Top Universities (< 100 ) ( 2015 )

  world_rank  institution                               national_rank    score
------------  --------------------------------------  ---------------  -------
          76  Ruprecht Karl University of Heidelberg                1    52.32
          79  Ludwig Maximilian University of Munich                2    52.16

38. defaultdict(): Map missing dictionary entries to a default value

In [6]:
from collections import defaultdict
pos = defaultdict(lambda: 'UNK')
sentence = 'this is a test'
pos['this'] = 'DT'
pos['test'] = ['NOUN','VERB']
for word in sentence.split():
    print(word, pos[word])
this DT
is UNK
a UNK
test ['NOUN', 'VERB']
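
defaultdict(int) maps missing entries to 0, which turns word counting into a one-liner per word (a small sketch):

In [ ]:
from collections import defaultdict
counts = defaultdict(int)
for word in 'this is a test and this is only a test'.split():
    counts[word] += 1
print(dict(counts))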

39. Cleaning Punctuation from a String

In [6]:
sentence = 'This, is a list! full of . , punctuation:'
cleaned  = "".join(c for c in sentence if c not in ('!',':','\'','"','`',',','.'))
print(cleaned)
This is a list full of   punctuation
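
str.translate with str.maketrans strips many characters in one pass and is the usual idiom for this:

In [ ]:
import string
sentence = 'This, is a list! full of . , punctuation:'
print(sentence.translate(str.maketrans('', '', string.punctuation)))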

40. Tag all sentences in corpus and take Ngrams: Bigram, Trigram, etc.

In [ ]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('',r'.*\.txt',cat_pattern=r'(\w+)/*')
sents = corpus.sents(categories=['news','general'])

cleaned_sents = []
for sent in sents:
    cleaned = [c for c in sent if c not in ('!',':','\'','"','`',',')]
    cleaned_sent = nltk.pos_tag(cleaned)
    cleaned_sents.append(cleaned_sent)

words = []
for entry in cleaned_sents:
    for word in entry:
        pattern = word[0],'/',word[1]
        words.append(pattern)

text = []
for tagged_word in words:
    tagged = ''.join(tagged_word)
    text.append(tagged)

tagged_text = ' '.join(text)
#print(tagged_text)


token = nltk.word_tokenize(tagged_text)
bigrams = ngrams(token,2)
trigrams = ngrams(token,3)
fourgrams = ngrams(token,4)
fivegrams = ngrams(token,5)


a = Counter(fourgrams)
freqdist = nltk.FreqDist(a)
b = freqdist.most_common(50)
for i in b:
    print(i)

41. TRAIN corpus chunk + Search Chunk in Sentence

In [ ]:
import nltk
from tabulate import tabulate
from operator import itemgetter
import matplotlib.pyplot as plt
from nltk.util import ngrams
from collections import Counter

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('',r'.*\.txt',cat_pattern=r'(\w+)/*')
sents = corpus.sents(categories=['news','general'])

cleaned_sents = []
for sent in sents:
    cleaned = [c for c in sent if c not in ('!',':','\'','"','`',',')]
    cleaned_sent = nltk.pos_tag(cleaned)
    cleaned_sents.append(cleaned_sent)

words = []
for entry in cleaned_sents:
    for word in entry:
        pattern = word[0],'/',word[1]
        words.append(pattern)

text = []
for tagged_word in words:
    tagged = ''.join(tagged_word)
    text.append(tagged)

tagged_text = ' '.join(text)
#print(tagged_text)


token = nltk.word_tokenize(tagged_text)
bigrams = ngrams(token,2)
trigrams = ngrams(token,3)
fourgrams = ngrams(token,4)
fivegrams = ngrams(token,5)


fourgram_bundle = Counter(fourgrams)
freqdist = nltk.FreqDist(fourgram_bundle)
bundles = freqdist.most_common(100)

entries = [a for a in bundles]
#print(tabulate(entries))


### END OF TRAINING

sentence = 'It has negative impact on us.'
tagged_sentence = nltk.pos_tag(sentence.split())

sentence_words = []
for word in tagged_sentence:
    pattern = word[0],'/',word[1]
    sentence_words.append(pattern)

tagged_sentence = []
for tagged_word in sentence_words:
    tagged = ''.join(tagged_word)
    tagged_sentence.append(tagged)

#final_tagged_sent = ' '.join(tagged_sentence)
#print(tagged_sentence)

sentence_fourgrams = ngrams(tagged_sentence,4)
for x in sentence_fourgrams:
    for entry in entries:
        if entry[0] == x:
            print(entry)

42. Classifier with Ngram as the featureset

In [ ]:
import nltk
from nltk import ngrams
import random

corpus = nltk.corpus.CategorizedPlaintextCorpusReader('', r'.*\.txt', cat_pattern=r'(\w+)/*')
documents = [(list(corpus.words(fileid)), category)
              for category in corpus.categories()
              for fileid in corpus.fileids(category)]

#Corpus sentences ngram
sents = corpus.sents()

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn', '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']

gramslist = []
n = 1  # unigrams; raise n for bigram, trigram, ... features
for sent in sents:
    sent = [x for x in sent if x not in stop_words]
    for grams in ngrams(sent, n):
        gramslist.append(grams)

def document_features(document):
    document_words = set(document)
    features = {}
    for word in gramslist:
        features['contains({})'.format(word)] = (word in document_words)
    return features

random.shuffle(documents)
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[102:], featuresets[:102]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
print(classifier.show_most_informative_features(10))