package Lingua::ES::Hyphenate;

use strict;
use warnings;

require Exporter;

# Inherit import() from Exporter.
our @ISA = ('Exporter');

# The public functions, listed once and reused for every export list below.
my @public_functions = ('hyphenate', 'syllable_cnt');

# Both functions are exported by default ...
our @EXPORT = @public_functions;

# ... may also be requested by name ...
our @EXPORT_OK = @public_functions;

# ... or all together through the ':all' tag.
our %EXPORT_TAGS = ( all => [ @public_functions ] );

our $VERSION = '.01';

=head1 NAME

Lingua::ES::Hyphenate - Separates Spanish words into syllables

=head1 SYNOPSIS

  use Lingua::ES::Hyphenate;

  @syllables = hyphenate('oportunidad'); # @syllables now holds ('o','por','tu','ni','dad')

  # or

  $word = Lingua::ES::Hyphenate->new('oportunidad');
  @syllables = $word->hyphenate;

=head1 DESCRIPTION

	Separates Spanish words into syllables.

=head1 SPANISH SYLLABLE STRUCTURE

	<From Wikipedia>
	The Spanish syllable structure can be summarized as follows: C1 C2 S1 V S2 C3 C4
	Spanish syllable structure allows a maximum of two consonants in its onset,
	a nucleus of a vowel followed by and/or preceded by a semivowel,
	and a maximum of two consonants in its coda.
	The following restrictions apply:
	Onset
		First consonant (C1): Can be any consonant.
		Second consonant (C2): If and only if the first consonant is a plosive 
		/p, t, k, b, d, g/ or a voiceless labiodental fricative /f/, 
		then the second consonant can be a liquid /l, r/. 
		Although they occur, the onsets /tl/ and /dl/ are not native to Spanish.
	Nucleus
		Semivowel (S1)
		Vowel (V)
		Semivowel (S2)
	Coda
		First consonant (C3): Can be any consonant.
		Second consonant (C4): Must be /s/.
		
=head1 SEE ALSO

	http://en.wikipedia.org/wiki/Spanish_phonology#Phonotactics
	
=cut

# ---------------------------------------------------------------------------
# Syllable grammar.
# The patterns below are built bottom-up (letters, consonants, onset, nucleus,
# coda) and combined into $syllable, which hyphenate() and syllable_cnt() apply
# repeatedly with /g. All of them are case-insensitive.
#
# NOTE(review): the non-ASCII characters inside the character classes do not
# display correctly in this copy of the file; presumably they are the accented
# vowels, U-diaeresis and N-tilde -- confirm the file's encoding before editing.
# ---------------------------------------------------------------------------

# File-scoped count of the syllables found by the most recent parse; written by
# hyphenate() and syllable_cnt(), and returned by syllable_cnt() when it is
# called without a word.
my $cnt;# global variable for number of syllables in last parsed word

# One character that may appear in a word; hyphenate() rejects any input that
# is not made up entirely of these.
my $letters = qr/[A�BCDE�FGHI�JKLMN��OPQRSTU�VWXYZ]/i;# Original note: Perl apparently did not treat one of these non-ASCII letters as the lowercase of another, so both cases are listed
# The atomic group (?>...) commits to a digraph (RR, LL, CH, QU) once matched,
# so the engine cannot back off and re-read it as two single consonants.
#prevent backtracking here; otherwise two letter consonants won't work.
my $anyCons = qr/(?>RR|LL|CH|QU|[BCDFGHJKLMN�PQRSTVWXYZ])/i;# any consonant


my $preR = qr/[PKCBGFTD]/i; # These may precede R in an onset
my $preL = qr/[PKCBGF]/i;	# These may precede L in an onset
# Optional second onset consonant: an L or R that is only accepted when the
# character just before it (checked with a lookbehind) is allowed to precede it.
# The first alternative reuses $preR, so it also admits T and D before L, but
# only when that consonant is the first character of the string.
my $C2 = qr/
	(?<=^$preR)L			# At the beginning of a word, a TL or DL (loan words)
		|					# or
	(?<=$preR)R				# PR KR CR BR GR FR TR DR
		|					# OR
	(?<=$preL)L				# PL KL CL BL GL FL
/ix;#
# Onset: one consonant (or digraph) optionally followed by a liquid.
my $onset = qr/$anyCons$C2?/i;# C2 is optional

# Nucleus building blocks: U and I act as semivowels; $vowel holds the remaining
# vowels; $allVows is the union of both and is used as a lookahead in $syllable.
my $semiVowel = qr/[UI]/i;
my $vowel = qr/[A�E�O���]/i;
my $allVows = qr/[UIA�E�O���]/i;
# Nucleus: a vowel with an optional semivowel on either side, or else a lone
# semivowel.
my $nucleus = qr/(?:$semiVowel?$vowel$semiVowel?)|$semiVowel/i;

# Coda: one consonant (or digraph) optionally followed by S.
my $coda = qr/${anyCons}S?/i;# separate $C4 variable seemed worthless.

# A full syllable: optional onset, mandatory nucleus, optional coda. The coda
# group is abandoned whenever the text after it shows that its consonant
# belongs to the onset of the next syllable.
my $syllable = qr/
	$onset? # onsets are optional
	$nucleus # nuclei are not optional
	(?: $coda
		# We must make sure that the letters after the coda cannot be an
		# onset to another syllable; if they are, we forget the coda and
		# parse the next consonants as the onset of the next syllable.
		(?(?<=$preL)			# IF the matched $coda was a pre L consonant
			(?!L)				# don't match a following L
		)
		(?(?<=$preR)			# IF the matched $coda was a pre R consonant
			(?!R)				# don't match a following R
		)
		(?!$allVows)			# don't match a following vowel or semivowel
	)? # coda is optional
/ix;# ignore case

=head1 CONSTRUCTOR

	Not necessary, since functions are exported.
	
	my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
	
=cut

# Constructor: stores the given word in a scalar and blesses a reference to
# that scalar into the invoking class.
sub new {
  my $class = shift;
  my $word  = shift;

  my $object = \$word;
  return bless $object, $class;
}

=head1 hyphenate

	Returns array of syllables from input word.
	
	my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
	@syllables = $hyphenater->hyphenate();
	
	# or
	
	@syllables = hyphenate('tomarlo')
	
=cut

# Splits a Spanish word into its syllables.
#
# May be called as a plain function, hyphenate($word), or as a method on an
# object created by new(), in which case the word stored in the object is used.
#
# Returns the list of syllables. Returns an empty list when no word is given
# or when the word contains anything other than the letters in $letters.
# Side effect: records the number of syllables found in the file-scoped $cnt,
# which syllable_cnt() reports when called without a word.
sub hyphenate {
	# Missing, empty or otherwise false argument: nothing to split.
	return () unless $_[0];

	my $word;
	if (ref($_[0]) eq 'Lingua::ES::Hyphenate')
	{
		my $self = shift;
		$word = $$self;
	}
	else
	{
		$word = shift;
	}

	# \A and \z anchor to the true ends of the string; a plain '$' would also
	# accept a trailing newline, which then leaked into the result as an extra
	# element. The defined() guard covers an object built without a word.
	return () unless defined $word && $word =~ /\A$letters+\z/;

	# Mark the end of every syllable with '='. A capture group replaces $&,
	# which slows down every regex in the program on older perls. s///g yields
	# an empty string when nothing matched, so normalise the count to 0.
	$cnt = ($word =~ s/($syllable)/$1=/g) || 0;

	return split /=/, $word;
}

=head1 syllable_cnt

	Returns number of syllables in string argument.
	If no argument is provided, returns the number of
	syllables in the last word parsed.
	
	my $cnt = syllable_cnt('tomarlo');
	
	# or
	
	my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
	my $cnt = $hyphenater->syllable_cnt('escuela');

	# or
	
	my @syllables = hyphenate('majaderías');
	$cnt = syllable_cnt();
	# same as
	$cnt = @syllables;
	
=cut

# Counts the syllables in a word.
#
# Call forms:
#   syllable_cnt($word)        - count the syllables of $word
#   $obj->syllable_cnt($word)  - same; an explicit argument takes precedence
#   $obj->syllable_cnt         - count the syllables of the word held by $obj
#   syllable_cnt()             - return the count from the last parsed word
#
# Returns the number of syllables (0 when none are found). An undefined or
# empty word in the plain function form returns the last recorded count.
# Side effect: every fresh count is stored in the file-scoped $cnt.
sub syllable_cnt{
	my $word;
	if (ref($_[0]) eq 'Lingua::ES::Hyphenate')
	{
		my $self = shift;
		# Honour the documented $obj->syllable_cnt('word') form; otherwise
		# fall back to the word stored in the object.
		$word = @_ ? shift : $$self;
		$word = '' unless defined $word;
	}
	elsif(@_ == 1)
	{
		$word = shift;
		# Nothing usable to parse: report the last recorded count.
		return $cnt unless defined $word && $word ne '';
	}
	else
	{
		return $cnt; # default: return number of syllables in last word
	}

	# Count the matches without modifying the word.
	my $found = 0;
	$found++ while $word =~ /$syllable/g;
	$cnt = $found;
	return $cnt;
}

1;
=head1 AUTHOR

Nathan Glenn, <garfieldnate@gmail.com>

=head1 COPYRIGHT AND LICENSE

Copyright 2010 by Nathan Glenn

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=head1 NEEDS WORK

	Atlanta splits as 'A-tlan-ta'. Is that correct? 'tl' and 'dl' are not
	native sounds, and Atlanta is a loan word, so maybe it's okay.
	'At-lan-ta' seems more natural to me.

=cut