git.maemo.org Git - pkg-perl/blob - deb-src/libperl-critic-perl/libperl-critic-perl-1.088/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm

   1 ##############################################################################
   2 #      $URL: http://perlcritic.tigris.org/svn/perlcritic/trunk/Perl-Critic/lib/Perl/Critic/Policy/RegularExpressions/ProhibitComplexRegexes.pm $
   3 #     $Date: 2008-07-03 10:19:10 -0500 (Thu, 03 Jul 2008) $
   4 #   $Author: clonezone $
   5 # $Revision: 2489 $
   6 ##############################################################################
   7
   8 package Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes;
   9
  10 use 5.006001;
  11 use strict;
  12 use warnings;
  13 use Readonly;
  14
  15 use English qw(-no_match_vars);
  16 use Carp;
  17
  18 use Perl::Critic::Utils qw{ :booleans :severities };
  19 use Perl::Critic::Utils::PPIRegexp qw{ parse_regexp get_match_string get_modifiers };
  20 use base 'Perl::Critic::Policy';
  21
  22 our $VERSION = '1.088';
  23
  24 #-----------------------------------------------------------------------------
  25
  26 Readonly::Scalar my $DESC => q{Split long regexps into smaller qr// chunks};
  27 Readonly::Scalar my $EXPL => [261];
  28
  29 #-----------------------------------------------------------------------------
  30
  31 sub supported_parameters {
  32     return (
  33         {
  34             name            => 'max_characters',
  35             description     =>
  36                 'The maximum number of characters to allow in a regular expression.',
  37             default_string  => '60',
  38             behavior        => 'integer',
  39             integer_minimum => 1,
  40         },
  41     );
  42 }
  43
  44 sub default_severity     { return $SEVERITY_MEDIUM           }
  45 sub default_themes       { return qw( core pbp maintenance ) }
  46 sub applies_to           { return qw(PPI::Token::Regexp::Match
  47                                      PPI::Token::Regexp::Substitute
  48                                      PPI::Token::QuoteLike::Regexp) }
  49
  50 #-----------------------------------------------------------------------------
  51
  52 sub violates {
  53     my ( $self, $elem, undef ) = @_;
  54
  55     # Optimization: if its short enough now, parsing won't make it longer
  56     return if $self->{_max_characters} >= length get_match_string($elem);
  57
  58     # If it has an "x" flag, it might be shorter after comment and whitespace removal
  59     my %modifiers = get_modifiers($elem);
  60     if ($modifiers{x}) {
  61        my $re = parse_regexp($elem);
  62        return if !$re; # syntax error, abort
  63        my $qr = $re->visual;
  64
  65        # HACK: Remove any (?xism:...) wrapper we may have added in the parse process...
  66        $qr =~ s/\A [(][?][xism]+(?:-[xism]+)?: (.*) [)] \z/$1/xms;
  67
  68        # Hack: don't count long \p{...} expressions against us so badly
  69        $qr =~ s/\\[pP][{]\w+[}]/\\p{...}/gmx;
  70
  71        return if $self->{_max_characters} >= length $qr;
  72     }
  73
  74     return $self->violation( $DESC, $EXPL, $elem );
  75 }
  76
  77 1;
  78
  79 __END__
  80
  81 #-----------------------------------------------------------------------------
  82
  83 =pod
  84
  85 =for stopwords BNF Tatsuhiko Miyagawa
  86
  87 =head1 NAME
  88
  89 Perl::Critic::Policy::RegularExpressions::ProhibitComplexRegexes - Split long regexps into smaller C<qr//> chunks.
  90
  91 =head1 AFFILIATION
  92
  93 This Policy is part of the core L<Perl::Critic> distribution.
  94
  95
  96 =head1 DESCRIPTION
  97
  98 Big regexps are hard to read, perhaps even the hardest part of Perl.
  99 A good practice to write digestible chunks of regexp and put them
 100 together.  This policy flags any regexp that is longer than C<N>
 101 characters, where C<N> is a configurable value that defaults to 60.
 102 If the regexp uses the C<x> flag, then the length is computed after
 103 parsing out any comments or whitespace.
 104
 105 =head1 CASE STUDY
 106
 107 As an example, look at the regexp used to match email
 108 addresses in L<Email::Valid::Loose> (tweaked lightly to wrap for POD)
 109
 110     (?x-ism:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]
 111     \000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015
 112     "]*)*")(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[
 113     \]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n
 114     \015"]*)*")|\.)*\@(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,
 115     ;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
 116     )(?:\.(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
 117     -\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)
 118
 119 which is constructed from the following code:
 120
 121     my $esc         = '\\\\';
 122     my $period      = '\.';
 123     my $space       = '\040';
 124     my $open_br     = '\[';
 125     my $close_br    = '\]';
 126     my $nonASCII    = '\x80-\xff';
 127     my $ctrl        = '\000-\037';
 128     my $cr_list     = '\n\015';
 129     my $qtext       = qq/[^$esc$nonASCII$cr_list\"]/; # "
 130     my $dtext       = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
 131     my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
 132     my $atom_char   = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;# "
 133     my $atom        = qq<$atom_char+(?!$atom_char)>;
 134     my $quoted_str  = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; # "
 135     my $word        = qq<(?:$atom|$quoted_str)>;
 136     my $domain_ref  = $atom;
 137     my $domain_lit  = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
 138     my $sub_domain  = qq<(?:$domain_ref|$domain_lit)>;
 139     my $domain      = qq<$sub_domain(?:$period$sub_domain)*>;
 140     my $local_part  = qq<$word(?:$word|$period)*>; # This part is modified
 141     $Addr_spec_re   = qr<$local_part\@$domain>;
 142
 143 If you read the code from bottom to top, it is quite readable.  And,
 144 you can even see the one violation of RFC822 that Tatsuhiko Miyagawa
 145 deliberately put into Email::Valid::Loose to allow periods.  Look for
 146 the C<|\.> in the upper regexp to see that same deviation.
 147
 148 One could certainly argue that the top regexp could be re-written more
 149 legibly with C<m//x> and comments.  But the bottom version is
 150 self-documenting and, for example, doesn't repeat C<\x80-\xff> 18
 151 times.  Furthermore, it's much easier to compare the second version
 152 against the source BNF grammar in RFC 822 to judge whether the
 153 implementation is sound even before running tests.
 154
 155 =head1 CONFIGURATION
 156
 157 This policy allows regexps up to C<N> characters long, where C<N>
 158 defaults to 60.  You can override this to set it to a different number
 159 with the C<max_characters> setting.  To do this, put entries in a
 160 F<.perlcriticrc> file like this:
 161
 162   [RegularExpressions::ProhibitComplexRegexes]
 163   max_characters = 40
 164
 165 =head1 CREDITS
 166
 167 Initial development of this policy was supported by a grant from the Perl Foundation.
 168
 169 =head1 AUTHOR
 170
 171 Chris Dolan <cdolan@cpan.org>
 172
 173 =head1 COPYRIGHT
 174
 175 Copyright (c) 2007-2008 Chris Dolan.  Many rights reserved.
 176
 177 This program is free software; you can redistribute it and/or modify
 178 it under the same terms as Perl itself.  The full text of this license
 179 can be found in the LICENSE file included with this module
 180
 181 =cut
 182
 183 # Local Variables:
 184 #   mode: cperl
 185 #   cperl-indent-level: 4
 186 #   fill-column: 78
 187 #   indent-tabs-mode: nil
 188 #   c-indentation-style: bsd
 189 # End:
 190 # ex: set ts=8 sts=4 sw=4 tw=78 ft=perl expandtab shiftround :