Basic aspects of the Natural Language Toolkit (NLTK).
Installation commands for conda and pip:
$ conda install --channel anaconda nltk
or
$ pip install nltk
import nltk
To install all the data requirements for NLTK, first define the output directory, then download the data by running:
PATH = 'D:/GitHub/machine-learning-notebooks/Natural-Language-Processing/nltk_data'
nltk.data.path.append(PATH)
nltk.download(download_dir=PATH)
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
Tokenization is the process of breaking up a text into smaller pieces called tokens. Tokenization can happen at several different levels: paragraphs, sentences, words, syllables, or phonemes.
Given the example text:
text = 'Hello everyone, how are you all? This is an example of text, which will be tokenized in several ways. Thank you!'
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)
Hello everyone, how are you all?
This is an example of text, which will be tokenized in several ways.
Thank you!
from nltk.tokenize import word_tokenize
words = word_tokenize(text)
for word in words:
    print(word)
Hello
everyone
,
how
are
you
all
?
This
is
an
example
of
text
,
which
will
be
tokenized
in
several
ways
.
Thank
you
!
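word_tokenize relies on NLTK's pretrained Punkt models. A rough, standard-library-only approximation of word-level tokenization (a simplified sketch, not NLTK's actual algorithm) can be written with a single regular expression that captures runs of word characters or individual punctuation marks:

```python
import re

def simple_word_tokenize(text):
    """Split text into word and punctuation tokens (rough approximation)."""
    return re.findall(r"\w+|[^\w\s]", text)

text = 'Hello everyone, how are you all?'
print(simple_word_tokenize(text))
# ['Hello', 'everyone', ',', 'how', 'are', 'you', 'all', '?']
```

Unlike this sketch, NLTK's tokenizer also handles contractions (e.g. splitting "don't" into "do" and "n't") and other language-specific conventions.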
Alphabetical list of part-of-speech tags used in the Penn Treebank Project.
Number | Tag | Description |
---|---|---|
1 | CC | Coordinating conjunction |
2 | CD | Cardinal number |
3 | DT | Determiner |
4 | EX | Existential there |
5 | FW | Foreign word |
6 | IN | Preposition or subordinating conjunction |
7 | JJ | Adjective |
8 | JJR | Adjective, comparative |
9 | JJS | Adjective, superlative |
10 | LS | List item marker |
11 | MD | Modal |
12 | NN | Noun, singular or mass |
13 | NNS | Noun, plural |
14 | NNP | Proper noun, singular |
15 | NNPS | Proper noun, plural |
16 | PDT | Predeterminer |
17 | POS | Possessive ending |
18 | PRP | Personal pronoun |
19 | PRP\$ | Possessive pronoun |
20 | RB | Adverb |
21 | RBR | Adverb, comparative |
22 | RBS | Adverb, superlative |
23 | RP | Particle |
24 | SYM | Symbol |
25 | TO | to |
26 | UH | Interjection |
27 | VB | Verb, base form |
28 | VBD | Verb, past tense |
29 | VBG | Verb, gerund or present participle |
30 | VBN | Verb, past participle |
31 | VBP | Verb, non-3rd person singular present |
32 | VBZ | Verb, 3rd person singular present |
33 | WDT | Wh-determiner |
34 | WP | Wh-pronoun |
35 | WP\$ | Possessive wh-pronoun |
36 | WRB | Wh-adverb |
tags = nltk.pos_tag(words)
print(tags)
[('Hello', 'NNP'), ('everyone', 'NN'), (',', ','), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('all', 'DT'), ('?', '.'), ('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('of', 'IN'), ('text', 'NN'), (',', ','), ('which', 'WDT'), ('will', 'MD'), ('be', 'VB'), ('tokenized', 'VBN'), ('in', 'IN'), ('several', 'JJ'), ('ways', 'NNS'), ('.', '.'), ('Thank', 'NNP'), ('you', 'PRP'), ('!', '.')]
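The tagger returns a plain list of (token, tag) pairs, so standard Python tools apply directly. For instance, counting how often each tag appears in the output above:

```python
from collections import Counter

# Tagged output from nltk.pos_tag(words) above.
tags = [('Hello', 'NNP'), ('everyone', 'NN'), (',', ','), ('how', 'WRB'),
        ('are', 'VBP'), ('you', 'PRP'), ('all', 'DT'), ('?', '.'),
        ('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('example', 'NN'),
        ('of', 'IN'), ('text', 'NN'), (',', ','), ('which', 'WDT'),
        ('will', 'MD'), ('be', 'VB'), ('tokenized', 'VBN'), ('in', 'IN'),
        ('several', 'JJ'), ('ways', 'NNS'), ('.', '.'), ('Thank', 'NNP'),
        ('you', 'PRP'), ('!', '.')]

# Count tag frequencies, ignoring the tokens themselves.
tag_counts = Counter(tag for _, tag in tags)
print(tag_counts.most_common(3))
# [('NN', 3), ('DT', 3), ('.', 3)]
```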
Chunking groups tagged tokens into phrases using a regular-expression grammar over POS tags. The rule below collects one or more noun tags (NN, NNS, NNP, NNPS) followed by a single-character tag (punctuation) into a chunk named Chunk:
rule = r'Chunk: {<NN[SP]*.?>+<.>}'
parser = nltk.RegexpParser(rule)
chunk = parser.parse(tags)
chunk.draw()  # opens a Tk window; use print(chunk) in a non-GUI environment
Stemming reduces a word to its stem by stripping suffixes; the result is not always a valid word (e.g. memorable -> memor below).
from nltk.stem import PorterStemmer
words = ['tokenization', 'running', 'pythonic', 'understandable', 'avoidable', 'memorable']
PS = PorterStemmer()
for word in words:
print(f'{word} -> {PS.stem(word)}')
tokenization -> token
running -> run
pythonic -> python
understandable -> understand
avoidable -> avoid
memorable -> memor
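The Porter algorithm applies an ordered series of suffix-rewriting rules with conditions on the remaining stem. A naive single-pass suffix stripper (a toy sketch, not the real algorithm; the suffix list is invented for illustration) shows the general idea and its limits:

```python
def naive_stem(word, suffixes=('ization', 'able', 'ing', 'ic', 's')):
    """Strip the first matching suffix (toy illustration, not Porter's algorithm)."""
    for suffix in suffixes:
        # Require a few characters to remain, so short words are left alone.
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[: -len(suffix)]
    return word

for word in ['tokenization', 'running', 'pythonic', 'avoidable']:
    print(f'{word} -> {naive_stem(word)}')
# tokenization -> token
# running -> runn   (Porter adds a rule to undo the doubled consonant)
# pythonic -> python
# avoidable -> avoid
```

The 'running -> runn' case shows why Porter's extra rules matter: after removing '-ing', the algorithm also reduces the doubled consonant to produce 'run'.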
Lemmatization maps a word to its dictionary form (lemma) using a vocabulary such as WordNet, so the result is always a valid word.
from nltk.stem import WordNetLemmatizer
words = ['children', 'feet', 'wolves', 'indices', 'leaves', 'mice', 'phenomena']
WL = WordNetLemmatizer()
for word in words:
print(f'{word} -> {WL.lemmatize(word)}')
children -> child
feet -> foot
wolves -> wolf
indices -> index
leaves -> leaf
mice -> mouse
phenomena -> phenomenon
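Unlike a stemmer, the lemmatizer works mainly by vocabulary lookup rather than suffix stripping, which is how it handles irregular forms like 'mice'. A toy dictionary-based version (the lookup table here is a hypothetical subset, for illustration only, not WordNet) captures the core idea:

```python
# Tiny irregular-plural table (illustrative subset, not WordNet).
IRREGULAR = {
    'children': 'child', 'feet': 'foot', 'wolves': 'wolf',
    'mice': 'mouse', 'phenomena': 'phenomenon',
}

def toy_lemmatize(word):
    """Look up irregular forms first, then fall back to a simple -s rule."""
    if word in IRREGULAR:
        return IRREGULAR[word]
    if word.endswith('s') and len(word) > 3:
        return word[:-1]
    return word

print(toy_lemmatize('children'))  # child
print(toy_lemmatize('cats'))     # cat
```

Note that WordNetLemmatizer also accepts a part-of-speech argument; without it, words are treated as nouns by default.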