MediaWiki  1.33.0
captcha.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 #
3 # Script to generate distorted text images for a captcha system.
4 #
5 # Copyright (C) 2005 Neil Harris
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License along
18 # with this program; if not, write to the Free Software Foundation, Inc.,
19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
21 #
22 # Further tweaks by Brion Vibber <brion@pobox.com>:
23 # 2006-01-26: Add command-line options for the various parameters
24 # 2007-02-19: Add --dirs param for hash subdirectory splits
25 # Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
26 # 2008-01-06: Add regex check to skip words containing other than a-z
27 
28 import random
29 import math
30 import hashlib
31 from optparse import OptionParser
32 import os
33 import sys
34 import re
35 import multiprocessing
36 import time
37 
38 try:
39  from PIL import Image
40  from PIL import ImageFont
41  from PIL import ImageDraw
42  from PIL import ImageEnhance
43  from PIL import ImageOps
44  from PIL import ImageMath
45 except:
46  sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")
47 
# Matches any character outside lowercase ASCII a-z. try_pick_word() rejects
# a candidate word when this pattern finds a match in it.
nonalpha = re.compile('[^a-z]')
49 
50 # Does X-axis wobbly copy, sandwiched between two rotates
def wobbly_copy(src, wob, col, scale, ang):
    """Return a copy of *src* with an X-axis wobble, sandwiched between
    two rotates.

    The image is rotated by roughly *ang*, every scan line inside the
    rotated image's bounding box is shifted sideways by a sine-wave offset
    plus random jitter, and the result is rotated back and sharpened.

    src   -- source PIL image
    wob   -- wobble amplitude; the random jitter is up to half of this
    col   -- unused; kept so existing callers keep working
    scale -- multiplier for the frequency of the sine wave
    ang   -- nominal rotation angle, varied at random by up to 10 either way

    If getbbox() on the rotated image returns None, *src* is returned
    unchanged.
    """
    width, height = src.size
    freq = random.uniform(4*scale, 5*scale)
    phase = random.uniform(0, math.pi*2)
    angle = ang + random.uniform(-10, 10)  # vary, but not too much
    warped = Image.new('RGB', src.size, 0)  # a black rectangle
    rotated = src.rotate(angle, Image.BILINEAR)
    # Do a cheap bounding-box op here to try to limit work below
    bounds = rotated.getbbox()
    if bounds is None:
        return src
    top, bottom = bounds[1], bounds[3]
    # and only do lines with content on
    for row in range(top, bottom + 1):
        # Drop a scan line in
        shift = int(math.sin(phase + (row*freq/height))*wob)
        shift += int(random.uniform(-wob*0.5, wob*0.5))
        warped.paste(rotated.crop((0, row, width, row + 1)), (shift, row))
    # try to stop blurring from building up
    warped = warped.rotate(-angle, Image.BILINEAR)
    return ImageEnhance.Sharpness(warped).enhance(2)
74 
75 
def gen_captcha(text, fontname, fontsize, file_name):
    """Render *text* as a distorted captcha image and save it.

    text      -- the string to draw
    fontname  -- font file handed to ImageFont.truetype
    fontsize  -- font size handed to ImageFont.truetype
    file_name -- output path; the image format is determined from it

    The text is drawn white on black, stirred by repeated wobbly_copy()
    calls, cropped with a small border, overlaid with blobby noise and
    finally inverted to black on white.

    All pixel arithmetic uses floor division so sizes and coordinates stay
    integers on Python 3 as well as Python 2.
    """
    # white text on a black background
    bgcolor = 0x0
    fgcolor = 0xffffff
    # create a font object
    font = ImageFont.truetype(fontname, fontsize)
    # determine dimensions of the text
    # NOTE(review): getsize() is reportedly gone from recent Pillow
    # releases -- confirm against the installed version.
    text_w, text_h = font.getsize(text)
    shorter = min(text_w, text_h)
    # create a new image significantly larger than the text
    edge = max(text_w, text_h) + 2*shorter
    im = Image.new('RGB', (edge, edge), bgcolor)
    canvas = ImageDraw.Draw(im)
    img_w, img_h = im.size
    # add the text to the image, centred
    canvas.text((img_w//2 - text_w//2, img_h//2 - text_h//2),
                text, font=font, fill=fgcolor)
    passes = 2
    wob = 0.09*text_h
    rot = 45
    # Apply lots of small stirring operations, rather than a few large ones
    # in order to get some uniformity of treatment, whilst
    # maintaining randomness
    for i in range(passes):
        im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
        im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
        im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
        rot += 30

    # now get the bounding box of the nonzero parts of the image
    bbox = im.getbbox()
    bord = shorter // 4  # a bit of a border
    im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))

    # Create noise at a quarter of the resolution in each direction
    nblock = 4
    nsize = (im.size[0] // nblock, im.size[1] // nblock)
    noise = Image.new('L', nsize, bgcolor)
    data = noise.load()
    for col in range(nsize[0]):
        # brightness ramp from left to right, identical for a whole column
        gradient = 70 * col // nsize[0]
        for row in range(nsize[1]):
            data[col, row] = random.randint(0, 65) + gradient
    # Turn speckles into blobs
    noise = noise.resize(im.size, Image.BILINEAR)
    # Add to the image
    im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)

    # and turn into black on white
    im = ImageOps.invert(im)

    # save the image, in format determined from filename
    im.save(file_name)
129 
def gen_subdir(basedir, md5hash, levels):
    """Generate a subdirectory path out of the first _levels_
    characters of _md5hash_, and ensure the directories exist
    under _basedir_.

    Returns the relative subdirectory path (one path component per
    character), or None when _levels_ is 0.
    """
    subdir = None
    for depth in range(levels):
        char = md5hash[depth]
        subdir = os.path.join(subdir, char) if subdir else char
        fulldir = os.path.join(basedir, subdir)
        try:
            os.mkdir(fulldir)
        except OSError:
            # Several worker processes may try to create the same directory
            # at once; losing that race is fine. Anything else (permissions,
            # a plain file in the way, ...) is still an error.
            if not os.path.isdir(fulldir):
                raise
    return subdir
145 
def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Make one attempt at producing a captcha word.

    words      -- list of candidate words, or None to use random letters
    blacklist  -- substrings that must not appear in the result
    verbose    -- print debugging information when true
    nwords     -- how many entries of *words* to concatenate (at least one
                  is always used)
    min_length -- minimum acceptable length
    max_length -- maximum acceptable length; values <= 0 mean "no limit"
                  (random-letter mode then uses 10, or *min_length* if
                  that is larger, as its upper bound)

    Returns the word, or None when this attempt produced nothing usable.
    """
    if max_length > 0 and min_length > max_length:
        # No word can satisfy contradictory bounds; in random-letter mode
        # random.randint() would even raise on them.
        if verbose:
            print("skipping because minimum length %d exceeds maximum length %d" % (min_length, max_length))
        return None

    if words is not None:
        if not words:
            # Nothing to choose from: report a failed attempt instead of
            # letting the random pick raise.
            if verbose:
                print("skipping because the word list is empty")
            return None
        word = ''.join(random.choice(words) for _ in range(max(1, nwords)))
    else:
        if max_length <= 0:
            max_length = max(10, min_length)
        length = random.randint(min_length, max_length)
        word = ''.join(chr(97 + random.randint(0, 25)) for _ in range(length))

    if verbose:
        print("word is %s" % word)

    if len(word) < min_length:
        if verbose:
            print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
        return None

    if max_length > 0 and len(word) > max_length:
        if verbose:
            print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
        return None

    if nonalpha.search(word):
        if verbose:
            print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
        return None

    for naughty in blacklist:
        # An empty entry (e.g. from a blank line in the blacklist file) is a
        # substring of every word and would reject everything; ignore it.
        if naughty and naughty in word:
            if verbose:
                print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
            return None
    return word
183 
def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
    """Return a usable captcha word, retrying try_pick_word() as needed.

    All arguments are forwarded unchanged to try_pick_word(). The script
    exits with an error message when no attempt succeeds.
    """
    attempts_left = 1000  # If we can't find a valid combination in 1000 tries, just give up
    while attempts_left > 0:
        candidate = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
        if candidate:
            return candidate
        attempts_left -= 1
    sys.exit("Unable to find valid word combinations")
190 
def read_wordlist(filename):
    """Read a word list file with one word per line.

    Each line is stripped of surrounding whitespace and lower-cased.
    Blank lines are dropped: an empty string would otherwise end up in the
    list, and as a blacklist entry it is a substring of every word.

    Returns the words as a list, in file order.
    """
    # "with" guarantees the file is closed even if reading fails
    with open(filename) as f:
        stripped = (line.strip().lower() for line in f)
        return [word for word in stripped if word]
196 
def run_in_thread(object):
    """Worker entry point: generate one batch of captcha images.

    *object* is the work description built by the main script, a sequence
    of [count, words, blacklist, opts, font, fontsize]:

    count     -- number of images this worker should generate
    words     -- word list, or None for random letters
    blacklist -- substrings that must not appear in a captcha word
    opts      -- the parsed command-line options
    font      -- font file to render with
    fontsize  -- font size to render with

    Each image is written below opts.output under the name
    image_<salt>_<hash>.png, where <hash> is the first 16 hex digits of
    md5(key + salt + word + key + salt).
    """
    count, words, blacklist, opts, font, fontsize = object[:6]

    # Take these settings from opts rather than from module-level globals:
    # the globals are only assigned under the __main__ guard, so they do not
    # exist in a worker process that re-imports this module instead of
    # inheriting the parent's state.
    key = opts.key
    output = opts.output
    dirs = opts.dirs
    verbose = opts.verbose

    for _ in range(count):
        word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
        salt = "%08x" % random.randrange(2**32)
        # 64 bits of hash is plenty for this purpose
        md5hash = hashlib.md5((key+salt+word+key+salt).encode('utf-8')).hexdigest()[:16]
        filename = "image_%s_%s.png" % (salt, md5hash)
        if dirs:
            subdir = gen_subdir(output, md5hash, dirs)
            filename = os.path.join(subdir, filename)
        if verbose:
            print(filename)
        gen_captcha(word, font, fontsize, os.path.join(output, filename))
217 
if __name__ == '__main__':
    # This grabs random words from the dictionary 'words' (one
    # word per line) and generates a captcha image for each one,
    # with a keyed salted hash of the correct answer in the filename.
    #
    # To check a reply, hash it in the same way with the same salt and
    # secret key, then compare with the hash value given.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = OptionParser()
    parser.add_option("--wordlist", help="A list of words (required)", metavar="WORDS.txt")
    parser.add_option("--random", help="Use random charcters instead of a wordlist", action="store_true")
    parser.add_option("--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY")
    parser.add_option("--output", help="The directory to put the images in - $wgCaptchaDirectory (required)", metavar="DIR")
    parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
    parser.add_option("--font-size", help="The font size (default 40)", metavar="N", type='int', default=40)
    parser.add_option("--count", help="The maximum number of images to make (default 20)", metavar="N", type='int', default=20)
    parser.add_option("--blacklist", help="A blacklist of words that should not be used", metavar="FILE", default=os.path.join(script_dir, "blacklist"))
    parser.add_option("--fill", help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs", metavar="N", type='int')
    parser.add_option("--dirs", help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels", metavar="N", type='int')
    parser.add_option("--verbose", "-v", help="Show debugging information", action='store_true')
    parser.add_option("--number-words", help="Number of words from the wordlist which make a captcha challenge (default 2)", type='int', default=2)
    parser.add_option("--min-length", help="Minimum length for a captcha challenge", type='int', default=1)
    parser.add_option("--max-length", help="Maximum length for a captcha challenge", type='int', default=-1)
    parser.add_option("--threads", help="Maximum number of threads to be used to generate captchas.", type='int', default=1)

    opts, args = parser.parse_args()

    # Validate the required options, exiting on the first one missing
    if opts.wordlist:
        wordlist = opts.wordlist
    elif opts.random:
        wordlist = None
    else:
        sys.exit("Need to specify a wordlist")
    if not opts.key:
        sys.exit("Need to specify a key")
    if not opts.output:
        sys.exit("Need to specify an output directory")
    if not (opts.font and os.path.exists(opts.font)):
        sys.exit("Need to specify the location of a font")

    # These stay module-level names because run_in_thread() may read them
    key = opts.key
    output = opts.output
    font = opts.font
    dirs = opts.dirs
    verbose = opts.verbose

    blacklist = read_wordlist(opts.blacklist)
    count = opts.count
    fill = opts.fill
    fontsize = opts.font_size
    threads = opts.threads

    if fill:
        # Only generate what is missing to reach N files in the directory
        count = max(0, fill - len(os.listdir(output)))

    words = None
    if wordlist:
        # Keep only 4-5 letter words that do not start with "f" and do not
        # begin or end with a doubled letter
        words = [x for x in read_wordlist(wordlist)
                 if len(x) in (4,5) and x[0] != "f"
                 and x[0] != x[1] and x[-1] != x[-2]]

    if count <= 0:
        sys.exit("No need to generate CAPTCHA images.")

    # Never start more workers than there are images, and always at least
    # one. Integer division keeps the per-worker count usable with range().
    threads = max(1, min(threads, count))
    chunks, leftover = divmod(count, threads)

    print("Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..." % (count, chunks, threads))
    # The first `leftover` workers take one extra image, so exactly `count`
    # images are produced even when count is not a multiple of threads.
    data = [[chunks + (1 if i < leftover else 0), words, blacklist, opts, font, fontsize]
            for i in range(threads)]

    p = multiprocessing.Pool(threads)
    try:
        p.map(run_in_thread, data)
    finally:
        # Release the worker processes whether or not generation succeeded
        p.close()
        p.join()
captcha.gen_subdir
def gen_subdir(basedir, md5hash, levels)
Definition: captcha.py:130
captcha.pick_word
def pick_word(words, blacklist, verbose, nwords, min_length, max_length)
Definition: captcha.py:184
captcha.read_wordlist
def read_wordlist(filename)
Definition: captcha.py:191
captcha.gen_captcha
def gen_captcha(text, fontname, fontsize, file_name)
Definition: captcha.py:76
Makefile.open
open
Definition: Makefile.py:18
captcha.try_pick_word
def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
Definition: captcha.py:146
captcha.wobbly_copy
def wobbly_copy(src, wob, col, scale, ang)
Definition: captcha.py:51
captcha.run_in_thread
def run_in_thread(object)
Definition: captcha.py:197