import nltk
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK resources used below (safe to re-run)
nltk.download('punkt')
nltk.download('stopwords')

# Fetch the corpus: one AER abstract per line
url = 'http://ximarketing.github.io/data/AER_abstracts.txt'
response = requests.get(url)
documents = response.text.split('\n')

# Preprocess the documents: lowercase, tokenize, keep alphanumeric tokens, drop stopwords
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_documents)

# Create a corpus: a list of bag-of-words vectors
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Set the number of topics
num_topics = 12

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Display the topics
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx}\nWords: {topic}\n')

def generate_wordclouds(lda_model, num_topics, nrows, ncols):
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))
    for i in range(num_topics):
        # Top 12 words and their weights for topic i
        topic_words = lda_model.show_topic(i, topn=12)
        word_freq = {word: freq for word, freq in topic_words}
        wc = WordCloud(background_color="white", width=800, height=400).generate_from_frequencies(word_freq)
        row = i // ncols
        col = i % ncols
        axs[row, col].imshow(wc, interpolation='bilinear')
        axs[row, col].set_title(f"Topic {i}")
        axs[row, col].axis('off')
    plt.tight_layout()
    plt.show()

# Generate word clouds for the 12 topics in a 3-by-4 grid
nrows = 3
ncols = 4
generate_wordclouds(lda_model, num_topics, nrows, ncols)
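
# Optional: inspect the topic mixture of a single abstract.
# A minimal sketch, not part of the original script: the sample text below
# is a hypothetical stand-in, and the snippet reuses the preprocess(),
# dictionary, and lda_model objects defined above via gensim's
# get_document_topics().
sample = "We study how advertising affects consumer demand in online markets."
bow = dictionary.doc2bow(preprocess(sample))
for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.05):
    print(f"Topic {topic_id}: {prob:.3f}")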