import nltk
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK resources used below (safe to re-run)
nltk.download('punkt')
nltk.download('stopwords')

# Fetch the corpus: one AER abstract per line
url = 'http://ximarketing.github.io/data/AER_abstracts.txt'
response = requests.get(url)
documents = response.text.split('\n')

# Preprocess the documents: lowercase, tokenize, keep alphanumeric tokens, drop stopwords
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_documents)

# Create a corpus: a list of bag-of-words vectors
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Set the number of topics
num_topics = 12

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Display the topics
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx}\nWords: {topic}\n')

def generate_wordclouds(lda_model, num_topics, nrows, ncols):
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))
    for i in range(num_topics):
        # Top 12 words and their weights for topic i
        topic_words = lda_model.show_topic(i, topn=12)
        word_freq = {word: freq for word, freq in topic_words}
        wc = WordCloud(background_color="white", width=800, height=400).generate_from_frequencies(word_freq)
        row = i // ncols
        col = i % ncols
        axs[row, col].imshow(wc, interpolation='bilinear')
        axs[row, col].set_title(f"Topic {i}")
        axs[row, col].axis('off')
    plt.tight_layout()
    plt.show()

# Generate word clouds for the 12 topics in a 3-by-4 grid
nrows = 3
ncols = 4
generate_wordclouds(lda_model, num_topics, nrows, ncols)
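
# Optional: inspect the topic mixture of a single abstract.
# A minimal sketch, not part of the original script: the sample text below
# is a hypothetical stand-in, and the snippet reuses the preprocess(),
# dictionary, and lda_model objects defined above via gensim's
# get_document_topics().
sample = "We study how advertising affects consumer demand in online markets."
bow = dictionary.doc2bow(preprocess(sample))
for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.05):
    print(f"Topic {topic_id}: {prob:.3f}")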