# NOTE(review): non-code residue (a Hindi couplet pasted in from the blog post);
# commented out so the file is valid Python. Original text preserved below.
# Kash tu chaand aur mai sitarah hota
# Aasmaan mein aashiyana hamara hota,
# Log tumhe door se dekhte,
# Pas se dekhne ka haq sirf hamara hota❣️❣️❣️❣️
import Levenshtein
from nltk.metrics.distance import edit_distance
import string
import pandas as pd
import re, nltk,string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from collections import Counter
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize,sent_tokenize ,ne_chunk,pos_tag
# Build a flat token list from the sample paragraph:
# lowercase -> split into sentences -> word-tokenize each sentence.
# FIX: the original for-loop bodies were unindented (IndentationError);
# rewritten as a nested comprehension.
paragraph = "i. It is a nice night.ii. This crap game is over a garage in Fifty-second Street... 3iii. …Nobody ever takes the newspapers she sells ...iv. …I am sitting in Mindy’s restaurant putting on the gefillte fish, which is a dish I am very fond of, ...v. The quick brown fox jumps over the lazy dog."
paragraph = paragraph.lower()
# NOTE: rebinds `paragraph` to the list of sentences (later code relies on this).
paragraph = nltk.sent_tokenize(paragraph)
# okk: flat list of word tokens across all sentences (punctuation included).
okk = [token for sentence in paragraph for token in nltk.word_tokenize(sentence)]
from spellchecker import SpellChecker

def find_misspelled_words(words):
    """Return the tokens that pyspellchecker does not recognize.

    words: iterable of word tokens.
    Returns: the set produced by SpellChecker.unknown() — tokens absent
    from the checker's dictionary (punctuation tokens typically land here too).
    """
    # FIX: the original function body was unindented (IndentationError).
    spell = SpellChecker()
    return spell.unknown(words)

misspelled_words = find_misspelled_words(okk)
print(misspelled_words)
# NOTE(review): `Levenshtein` and `SpellChecker` are already imported above;
# these re-imports are harmless but redundant.
import Levenshtein as lev
from spellchecker import SpellChecker
# Module-level checker instance used by the dormant suggestion routine below.
spell = SpellChecker()
# Dormant (commented-out) routine: for each misspelled word, rank
# SpellChecker's candidate corrections by Levenshtein distance to the typo.
# def suggest_corrections(misspelled_words):
# suggestions = {}
# for word in misspelled_words:
# # Get close matches
# candidates = spell.candidates(word)
# # Suggest based on Levenshtein distance
# suggestions[word] = sorted(candidates, key=lambda candidate: lev.distance(word, candidate))
# return suggestions
# corrections = suggest_corrections(misspelled_words)
# # print(misspelled_count)
# print(corrections)
#before cleaning
# Total token count before any cleaning / stop-word removal.
print(len(okk))
from nltk.stem.porter import PorterStemmer

# Stem every token with the Porter algorithm.
# FIX: the original loop body was unindented (IndentationError), and it
# constructed a fresh PorterStemmer for every token — hoist it out of the loop.
_stemmer = PorterStemmer()
stemming_text = [_stemmer.stem(token) for token in okk]
print(stemming_text)
# Frequency distribution over all tokens (punctuation included — no cleaning yet).
word_freq = Counter(okk)
print(word_freq)
# Get the 15 most common words
most_common_words = word_freq.most_common(15)
# FIX: the original unzip loop body was unindented (IndentationError);
# unzip the (word, count) pairs into parallel sequences instead.
if most_common_words:
    words, counts = zip(*most_common_words)
else:
    words, counts = (), ()
# Plot frequency distribution
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.title('Top 15 Most Common Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
from wordcloud import WordCloud

# Render the full token stream as a word cloud image.
word_string = ' '.join(okk)
# Generate word cloud (split from the original one-liner for readability).
cloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud.generate(word_string)
# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Token frequencies (same data as the earlier Counter pass) and vocabulary size.
mostcommonword = Counter(okk)
print(mostcommonword)
uniqueword = set(okk)
print(len(uniqueword))
##################################################################### cleaning start
# paragraph=paragraph.translate(paragraph.maketrans("","",string.punctuation))
# print(paragraph)
# print()
# sentences = sent_tokenize(paragraph)
# from nltk.corpus import stopwords
# filterword =[]
# stp = nltk.corpus.stopwords.words("english")
# for kk in sentences:
# print(kk)
# tokk = nltk.word_tokenize(kk)
# print(tokk)
# print()
# print()
# for i in tokk:
# if i.isalpha():
# if i not in stp:
# filterword.append(i)
# # print(sentences)
# print(filterword)
# tagg = []
# postt =nltk.pos_tag(filterword)
# tagg.append(postt)
# print(tagg)
# print()
# print()
# ner_result = ne_chunk(postt)
# print(ner_result)
# print()
# print()
# print()
# print()
# print()
# print()
# print()
# # now i will count the words
# paragraph = nltk.sent_tokenize(paragraph)
# Demo: Levenshtein (edit) distance between a typo and its intended word.
s1 = "blokchn"
s2 = "blockchain"
edit_distanc = Levenshtein.distance(s1, s2)
# FIX: the original message was garbled ("between'{}' and this '{}'" — missing
# spaces and stray wording); cleaned up the user-facing string.
print("the edit distance between '{}' and '{}' is {}".format(s1, s2, edit_distanc))
# NOTE(review): blog-page footer text, not code — commented out so the file parses.
# No comments:
# Post a Comment
# Keep it concise