
CHINTU ❤️

If only you were the moon and I a star,

our home would have been in the sky;

people would watch you from afar,

but the right to see you up close would be ours alone ❣️❣️





import string

import matplotlib.pyplot as plt
import nltk
import Levenshtein
from collections import Counter
from nltk import ne_chunk, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the NLTK data needed for tokenization, stopwords, POS tagging, and NER
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')



paragraph = "i. It is a nice night.ii. This crap game is over a garage in Fifty-second Street... 3iii. …Nobody ever takes the newspapers she sells ...iv. …I am sitting in Mindy’s restaurant putting on the gefillte fish, which is a dish I am very fond of, ...v. The quick brown fox jumps over the lazy dog."

paragraph=paragraph.lower()


paragraph = nltk.sent_tokenize(paragraph)

# print(paragraph)

okk=[]

for i in paragraph:

    # print(i)

    ok=nltk.word_tokenize(i)

    for jj in ok:


        okk.append(jj)
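The nested loop above is equivalent to a single flattening list comprehension; a minimal alternative, assuming the same `paragraph` string, that produces an identical okk list:

okk = [token for sentence in nltk.sent_tokenize(paragraph)
       for token in nltk.word_tokenize(sentence)]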



from spellchecker import SpellChecker

def find_misspelled_words(words):
    # unknown() returns the subset of words missing from the checker's dictionary
    spell = SpellChecker()
    return spell.unknown(words)


misspelled_words = find_misspelled_words(okk)

print(misspelled_words)


import Levenshtein as lev
from spellchecker import SpellChecker

spell = SpellChecker()

def suggest_corrections(misspelled_words):
    suggestions = {}
    for word in misspelled_words:
        # Candidate corrections from the spellchecker's dictionary
        # (candidates() can return None in recent pyspellchecker versions)
        candidates = spell.candidates(word) or []
        # Rank candidates by Levenshtein distance to the misspelled word
        suggestions[word] = sorted(candidates, key=lambda candidate: lev.distance(word, candidate))
    return suggestions

corrections = suggest_corrections(misspelled_words)
print(corrections)
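When only the single best guess per word is wanted, pyspellchecker's correction() does this ranking internally; note it returns None when no plausible candidate exists:

for word in misspelled_words:
    # Most likely correction, or None if nothing in the dictionary is close enough
    print(word, '->', spell.correction(word))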




# Token count before cleaning
print(len(okk))

from nltk.stem.porter import PorterStemmer

# Reduce every token to its Porter stem
stemmer = PorterStemmer()
stemming_text = [stemmer.stem(token) for token in okk]

print(stemming_text)
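A quick way to see what the stemmer does to individual words:

for w in ['jumps', 'sitting', 'newspapers', 'restaurant']:
    print(w, '->', stemmer.stem(w))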


word_freq = Counter(okk)

print(word_freq)


# Get the 15 most common words

most_common_words = word_freq.most_common(15)

words = []

counts = []

for word, count in most_common_words:

    words.append(word)

    counts.append(count)

# print(counts)

# Plot frequency distribution

plt.figure(figsize=(10,6))

plt.bar(words, counts)

plt.title('Top 15 Most Common Words')

plt.xlabel('Words')

plt.ylabel('Frequency')

plt.xticks(rotation=45)

plt.show()
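FreqDist is imported above but never used; it gives the same counts plus a built-in plot. A minimal sketch over the same okk tokens:

fdist = FreqDist(okk)
print(fdist.most_common(15))
fdist.plot(15, cumulative=False)  # line plot of the 15 most common tokens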



from wordcloud import WordCloud

word_string = ' '.join(okk)


# Generate word cloud

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(word_string)


# Display the word cloud

plt.figure(figsize=(10, 6))

plt.imshow(wordcloud, interpolation='bilinear')

plt.axis('off')

plt.show()


# Vocabulary size: number of distinct tokens
# (the full Counter was already printed above as word_freq)
uniqueword = set(okk)

print(len(uniqueword))
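One number often derived from these two counts (an addition here, not in the original) is the type-token ratio, a rough measure of lexical diversity:

# Type-token ratio: distinct tokens divided by total tokens
print(len(uniqueword) / len(okk))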


##################################################################### cleaning start

# Strip punctuation from the original string, then re-tokenize
cleaned = paragraph.translate(str.maketrans("", "", string.punctuation))
print(cleaned)
print()

clean_sentences = sent_tokenize(cleaned)

stp = stopwords.words("english")
filterword = []
for kk in clean_sentences:
    print(kk)
    for i in word_tokenize(kk):
        # Keep only alphabetic tokens that are not stopwords
        if i.isalpha() and i not in stp:
            filterword.append(i)

print(filterword)

# Part-of-speech tagging on the cleaned tokens
postt = pos_tag(filterword)
print(postt)

# Named-entity chunking over the POS-tagged tokens
ner_result = ne_chunk(postt)
print(ner_result)
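To pull just the entity strings out of the tree that ne_chunk returns, one option (a sketch relying on nltk.Tree's label() and leaves()) is:

for subtree in ner_result:
    # Entity spans appear as Tree nodes labelled with the entity type;
    # plain (word, tag) tuples are non-entities
    if hasattr(subtree, 'label'):
        entity = ' '.join(word for word, tag in subtree.leaves())
        print(subtree.label(), ':', entity)

On this lowercased paragraph the chunker may find few or no entities, since NLTK's NE chunker leans heavily on capitalization.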






s1 = "blokchn"


s2 = "blockchain"



edit_distanc = Levenshtein.distance(s1,s2)

print("the edit distance between'{}' and this '{}' is '{}'".format(s1,s2,edit_distanc))


Chintu ❤️

Not anymore... ❤️

I have reformed again and again, yet I will go astray once more; you will ask how I am, and I will fall in love all over again.
เคธुเคงเคฐ เคธुเคงเคฐ เค•े เคธुเคงเคฐा เคนूँ  เคฎैं เคซ़िเคฐ เคธे เคฌिเค—เคก़ เคœाเคŠँเค—ा  เคคुเคฎ เคชूเค›ोเค—े เคนाเคฒ เคฎेเคฐा  เคฎैं เค‡เคถ्เค•़ เคฎें เคชเคก़ เคœाเคŠँเค—ा