Scraping Comments From Instagram - Alternative Approach

# What is this?

I built this tool as a workaround for Instagram's strong anti-bot measures. It turns screen recordings of Instagram comments into structured data by extracting frames from videos, applying OCR to the images, and filtering the results into clean text. The system processes the video in chunks, identifies the comment regions, and compiles everything into a CSV. You can use this to analyze trends/narratives, collect feedback on products, or gather training data for sentiment analysis models. This is more of a proof of concept than a tool that would be used in production. The idea is to sidestep Instagram's anti-bot technology to convert engagement into digestible data.

# Code

```python
import argparse
import csv
import os
import re
from datetime import datetime

import cv2
import numpy as np
import pytesseract
from PIL import Image

# Crop applied to every frame before OCR: skip the top CROP_TOP_PX pixels,
# keep rows up to CROP_BOTTOM_FRACTION of the height and columns up to
# CROP_RIGHT_FRACTION of the width.
CROP_TOP_PX = 200
CROP_BOTTOM_FRACTION = 0.65
CROP_RIGHT_FRACTION = 0.85

# The cropped region is enlarged by this factor before it is handed to OCR.
OCR_UPSCALE_FACTOR = 4

TESSERACT_CONFIG = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'

CSV_HEADER = ['VideoSource', 'Comments']
COMMENT_SEPARATOR = "\n\n\n"
VIDEO_EXTENSIONS = ('.mp4', '.mov', '.m4v', '.avi')

# Matches a line made up of digits only.
_DIGITS_ONLY = re.compile(r'^\d+$')


def filter_line(line):
    """Return True if *line* should be kept as comment text.

    A line is rejected when, after stripping and lower-casing, it is empty,
    starts with "view ", equals "reply", or consists only of digits.

    Args:
        line: A single line of OCR output.

    Returns:
        bool: False for the rejected cases above, True otherwise.
    """
    normalized = line.strip().lower()
    if not normalized:
        return False
    if normalized.startswith("view ") or normalized == "reply":
        return False
    if _DIGITS_ONLY.match(normalized):
        return False
    return True


def extract_comments_from_text(text):
    """Split OCR output into cleaned lines.

    Blank lines, lines that are exactly "reply" or "like" (case-insensitive)
    and digit-only lines are dropped.

    Args:
        text: Raw multi-line OCR output.

    Returns:
        list[str]: The remaining lines, stripped, in their original order.
    """
    kept_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.lower() in ('reply', 'like'):
            continue
        if _DIGITS_ONLY.match(line):
            continue
        kept_lines.append(line)
    return kept_lines


def process_image_for_comments(img):
    """Crop a frame, upscale it, run OCR on it and return the cleaned lines.

    Args:
        img: Frame as a NumPy array, as returned by ``cv2.imread``.

    Returns:
        list[str]: Lines produced by ``extract_comments_from_text``. An empty
        list is returned when the frame is too small for the crop to contain
        any pixels.
    """
    height, width = img.shape[:2]
    top = CROP_TOP_PX
    bottom = int(height * CROP_BOTTOM_FRACTION)
    right = int(width * CROP_RIGHT_FRACTION)

    # On small frames the fixed top offset can reach past the bottom edge of
    # the crop; cv2.resize raises on an empty array, so bail out early.
    if bottom <= top or right <= 0:
        return []

    cropped = img[top:bottom, 0:right]

    # OpenCV stores colour frames as BGR while PIL interprets a 3-channel
    # array as RGB, so swap the channels before building the PIL image.
    if cropped.ndim == 3 and cropped.shape[2] == 3:
        cropped = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)

    cropped_h, cropped_w = cropped.shape[:2]
    resized = cv2.resize(
        cropped,
        (int(cropped_w * OCR_UPSCALE_FACTOR), int(cropped_h * OCR_UPSCALE_FACTOR)),
        interpolation=cv2.INTER_LINEAR,
    )

    text = pytesseract.image_to_string(
        Image.fromarray(resized), lang='eng', config=TESSERACT_CONFIG
    )
    return extract_comments_from_text(text)


def extract_frames_from_video(video_path, output_folder, sample_rate=1):
    """Write every ``sample_rate``-th frame of a video to ``output_folder``.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory for the JPEG frames; created if missing.
        sample_rate: Keep one frame out of every ``sample_rate`` frames.

    Returns:
        list[str]: Paths of the frames that were written, in order.

    Raises:
        ValueError: If ``sample_rate`` is smaller than 1 or the video cannot
            be opened.
    """
    if sample_rate < 1:
        raise ValueError(f"sample_rate must be a positive integer, got {sample_rate}")

    os.makedirs(output_folder, exist_ok=True)

    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # A reported frame rate of 0 would otherwise raise ZeroDivisionError.
        duration = frame_count / fps if fps > 0 else 0.0

        print(f"Video information:")
        print(f"- Duration: {duration:.2f} seconds")
        print(f"- Frame rate: {fps:.2f} fps")
        print(f"- Total frames: {frame_count}")
        print(f"- Sampling every {sample_rate} frame(s)")

        estimated_frames = frame_count // sample_rate
        print(f"Will extract approximately {estimated_frames} frames")

        frame_paths = []
        frame_index = 0
        extracted_count = 0

        while True:
            success, frame = video.read()
            if not success:
                break

            if frame_index % sample_rate == 0:
                frame_filename = os.path.join(
                    output_folder, f"frame_{extracted_count:05d}.jpg"
                )
                # imwrite reports failure through its return value; only
                # record frames that actually reached the disk.
                if cv2.imwrite(frame_filename, frame):
                    frame_paths.append(frame_filename)
                    extracted_count += 1
                    if extracted_count % 10 == 0:
                        print(f"Extracted {extracted_count} frames so far...")
                else:
                    print(f"Warning: Could not write frame {frame_filename}")

            frame_index += 1
    finally:
        # Release the capture even when an error interrupts extraction.
        video.release()

    print(f"Finished extracting {extracted_count} frames from video")
    return frame_paths


def process_frames_to_single_row(frame_paths, video_filename, csv_path):
    """OCR the given frames and store the unique lines as one CSV row.

    The CSV is rewritten every time a frame contributes new lines, so partial
    results are on disk while processing is still running. Whatever the file
    contained when this function was called is written back unchanged ahead
    of the row for this video, so rows from earlier videos are kept.

    Args:
        frame_paths: Paths of the frame images to process, in order.
        video_filename: Value stored in the ``VideoSource`` column.
        csv_path: CSV file to create or extend.

    Returns:
        int: Number of unique lines collected for this video.
    """
    # Make sure the file starts with the header row.
    if not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0:
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(CSV_HEADER)

    # Keep the current contents as raw text so they can be written back
    # verbatim on every update, without re-parsing earlier rows.
    with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile:
        existing_content = csvfile.read()
    if existing_content and not existing_content.endswith(('\n', '\r')):
        existing_content += '\r\n'

    ordered_comments = []  # first-seen order, used for the CSV cell
    seen_comments = set()  # constant-time duplicate check

    total_frames = len(frame_paths)

    for i, frame_path in enumerate(frame_paths):
        print(f"Processing frame {i+1}/{total_frames}: {frame_path}")

        img = cv2.imread(frame_path)
        if img is None:
            print(f"Warning: Could not read frame {frame_path}")
            continue

        new_comments_added = False
        for comment in process_image_for_comments(img):
            if comment not in seen_comments:
                seen_comments.add(comment)
                ordered_comments.append(comment)
                new_comments_added = True

        if new_comments_added:
            with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                csvfile.write(existing_content)
                csv.writer(csvfile).writerow(
                    [video_filename, COMMENT_SEPARATOR.join(ordered_comments)]
                )
            print(f"Updated CSV with new comments (total: {len(ordered_comments)})")

        if (i+1) % 5 == 0 or (i+1) == total_frames:
            print(f"Progress: {i+1}/{total_frames} frames processed")
            print(f"Found {len(ordered_comments)} unique comments so far")

    print(f"Complete! Extracted {len(ordered_comments)} unique comments from {total_frames} frames")
    print(f"Results saved to {csv_path}")

    return len(ordered_comments)


def process_directory_of_videos(directory_path, output_csv, frames_dir_base, sample_rate):
    """Process every video file in a directory into one shared CSV.

    Files are selected by extension (``VIDEO_EXTENSIONS``, case-insensitive)
    and processed in sorted name order. A failure in one video is reported
    and the remaining videos are still processed.

    Args:
        directory_path: Directory to scan for video files.
        output_csv: Base CSV path; a timestamp is inserted before ``.csv``.
        frames_dir_base: Prefix for the per-video frame directories.
        sample_rate: Passed through to ``extract_frames_from_video``.
    """
    video_files = [
        os.path.join(directory_path, name)
        for name in sorted(os.listdir(directory_path))
        if name.lower().endswith(VIDEO_EXTENSIONS)
    ]

    if not video_files:
        print(f"No video files found in {directory_path}")
        return

    print(f"Found {len(video_files)} video files to process")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = f"{os.path.splitext(output_csv)[0]}_{timestamp}.csv"

    for i, video_path in enumerate(video_files):
        video_filename = os.path.basename(video_path)
        print(f"\nProcessing video {i+1}/{len(video_files)}: {video_filename}")

        video_frames_dir = f"{frames_dir_base}_{timestamp}_{i}"

        # Deliberately broad: one unreadable video must not stop the batch.
        try:
            print("\nStep 1: Extracting frames from video...")
            frame_paths = extract_frames_from_video(video_path, video_frames_dir, sample_rate)

            print("\nStep 2: Processing frames for comments...")
            comment_count = process_frames_to_single_row(frame_paths, video_filename, csv_path)

            print(f"\n✅ Successfully processed video {video_filename}")
            print(f" Extracted {comment_count} unique comments")
        except Exception as e:
            print(f"\n❌ Error processing video {video_filename}: {str(e)}")
            continue

    print(f"\nAll videos processed. Results saved to {csv_path}")


def main():
    """Parse the command line and run the pipeline on a video or a directory."""
    parser = argparse.ArgumentParser(description="Extract Instagram comments from screen recordings")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--video", "-v", help="Path to a single Instagram screen recording video")
    group.add_argument("--directory", "-d", help="Path to a directory containing multiple videos")

    parser.add_argument("--output", "-o", default="instagram_comments.csv",
                        help="Output CSV file to save extracted comments")
    parser.add_argument("--frames-dir", "-f", default="extracted_frames",
                        help="Base directory to save extracted frames")
    parser.add_argument("--sample-rate", "-s", type=int, default=30,
                        help="Sample rate for frame extraction (higher number = fewer frames)")

    args = parser.parse_args()

    # Zero or negative values would otherwise fail later with a modulo error.
    if args.sample_rate < 1:
        parser.error("--sample-rate must be a positive integer")

    print("=" * 50)
    print("Instagram Comment Extraction Pipeline")
    print("=" * 50)

    try:
        if args.video:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            frames_dir = f"{args.frames_dir}_{timestamp}"
            csv_path = f"{os.path.splitext(args.output)[0]}_{timestamp}.csv"

            print("\nStep 1: Extracting frames from video...")
            frame_paths = extract_frames_from_video(args.video, frames_dir, args.sample_rate)

            print("\nStep 2: Processing frames for comments...")
            video_filename = os.path.basename(args.video)
            comment_count = process_frames_to_single_row(frame_paths, video_filename, csv_path)

            print(f"\n✅ Process completed successfully!")
            print(f"Extracted {comment_count} comments from {video_filename}")
            print(f"CSV file with extracted comments: {csv_path}")
        else:
            process_directory_of_videos(
                args.directory,
                args.output,
                args.frames_dir,
                args.sample_rate
            )
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        raise


if __name__ == "__main__":
    main()
```