Revision 548

View differences:

org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/test/java/org/gvsig/scripting/app/extension/AppTest.java
1
/* gvSIG. Geographic Information System of the Valencian Government
2
 *
3
 * Copyright (C) 2007-2008 Infrastructures and Transports Department
4
 * of the Valencian Government (CIT)
5
 * 
6
 * This program is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU General Public License
8
 * as published by the Free Software Foundation; either version 2
9
 * of the License, or (at your option) any later version.
10
 * 
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 * 
16
 * You should have received a copy of the GNU General Public License
17
 * along with this program; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 
19
 * MA  02110-1301, USA.
20
 * 
21
 */
22

  
23
package org.gvsig.scripting.app.extension;
24

  
25
import junit.framework.Test;
26
import junit.framework.TestCase;
27
import junit.framework.TestSuite;
28

  
29
/**
30
 * Unit test for simple App.
31
 */
32
public class AppTest 
33
    extends TestCase
34
{
35
    /**
36
     * Create the test case
37
     *
38
     * @param testName name of the test case
39
     */
40
    public AppTest( String testName )
41
    {
42
        super( testName );
43
    }
44

  
45
    /**
46
     * @return the suite of tests being tested
47
     */
48
    public static Test suite()
49
    {
50
        return new TestSuite( AppTest.class );
51
    }
52

  
53
    /**
54
     * Rigourous Test :-)
55
     */
56
    public void testApp()
57
    {
58
        assertTrue( true );
59
    }
60
}
org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/main/resources-plugin/gvsigsh
1
#!/bin/sh
#
# Launcher for the scripting main class (org.gvsig.scripting.main.Main).
# Locates the gvSIG installation folder relative to this script, collects
# the jar/zip files of a fixed list of plugin lib folders into a classpath
# and then sources gvSIG.sh with GVSIG_LAUNCHER, GVSIG_CLASSPATH and
# GVSIG_PARAMS set.

#set -x

# Resolve the folder holding this script once, as an absolute path.
# Later steps must not call dirname "$0" again after changing directory:
# with a relative $0 that would resolve against the wrong folder.
SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) || exit 1
cd "$SCRIPT_DIR" || exit 1

# Succeeds when at least one argument is an existing regular file.
# Used with glob patterns, which may expand to zero, one or many paths
# (a bare `[ -f pattern* ]` fails with an error on several matches).
any_file() {
  for candidate in "$@" ; do
    if [ -f "$candidate" ] ; then
      return 0
    fi
  done
  return 1
}

if any_file lib/org.gvsig.andami-2* ; then
  # The script lives in the installation folder itself.
  GVSIG_HOME="$SCRIPT_DIR"
elif any_file lib/org.gvsig.scripting.main-* ; then
  # The script lives three levels below the installation folder.
  GVSIG_HOME="$SCRIPT_DIR/../../.."
else
  echo "Can't locate gvSIG installation folder"
  exit 1
fi

cd "${GVSIG_HOME}" || exit 1
GVSIG_HOME=$PWD

for d in gvSIG/extensiones/org.gvsig.app/lib \
  gvSIG/extensiones/org.gvsig.scripting.app.extension/lib \
  gvSIG/extensiones/org.gvsig.geometry.app.generalpath/lib \
  gvSIG/extensiones/org.gvsig.projection.app.proj4j/lib \
  gvSIG/extensiones/org.gvsig.projection.app.jcrs/lib ;
  do
  # Jars first, then zips, as separate classpath entries.
  for i in "$d"/*.jar "$d"/*.zip ; do
    # An unmatched pattern stays literal; do not add it to the classpath.
    [ -e "$i" ] || continue
    LIBRARIES="$LIBRARIES:$i"
  done
done

GVSIG_LAUNCHER=org.gvsig.scripting.main.Main
GVSIG_CLASSPATH="$LIBRARIES"
GVSIG_PARAMS=" --plugins-folder=gvSIG/extensiones $*"
. ./gvSIG.sh
39

  
org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/main/resources-plugin/plugin-persistence.def
1
<?xml version="1.0"?>
2
<!--
3
Definitions of plugin persistence org.gvsig.scripting.app.mainplugin.
4
 -->
5
<definitions>
6
  <version>1.0.0</version>
7
  <classes>
8
    <class name="org.gvsig.scripting.app.mainplugin">
9
      <description>Persistence of scripting plugin</description>
10
      <fields>
11
        <field name="ComposerUseHisWindowManager" type="boolean" mandatory="true" defaultValue="true">
12
          <description>When true, the composer uses its own window manager instead of the gvSIG window manager.</description>
13
        </field>
14
      </fields>
15
    </class>
16
  </classes>
17
</definitions>
org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/main/resources-plugin/i18n/text.properties
1
_Scripting=Scripting
2
_Show_scripting_launcher=Abrir lanzador de scripts
3
_Show_scripting_composer=Abrir editor de scripts
4
_Show_scripting_jython_console=Abrir consola Jython
5

  
6
User=Usuario
7
System=Sistema
8

  
9
Accept=Aceptar
10
Cancel=Cancelar
11
Apply=Aplicar
12

  
13
Retrieving_data=Recibiendo datos...
14

  
15
File=Fichero
16
New=Nuevo
17
Close_document=Cerrar documento
18
Close_all_documents=Cerrar todos los documentos
19
Run=Ejecutar
20
Save=Guardar
21
Close=Cerrar
22
Close_current_tab=Cerrar pesta\u00f1a actual
23
Open_edition_mode=Abrir modo edici\u00f3n
24
Run_selected_script=Ejecutar script seleccionado
25
Delete_selected_script=Borrar script seleccionado
26
Refresh_directories=Actualizar carpetas
27
Move_files=Mover ficheros
28
Set_file_preferences=Editar preferencias de fichero
29

  
30

  
31
Edit=Editar
32
Cut=Cortar
33
Copy=Copiar
34
Paste=Pegar
35
Select_all=Seleccionar todo
36

  
37
Tools=Herramientas
38
Launcher=Lanzador de scripts
39
Scripting_Launcher=Lanzador de scripts
40
Scripting_Composer=Editor de scripts
41
Jython_Console=Consola Jython
42
Import_JavaDoc=Importar JavaDoc
43
JavaDoc=Ayuda de Java (JavaDoc)
44
Remove_JavaDoc=Eliminar JavaDoc
45
Help_contents=Contenido de la ayuda
46
About_scripts_editor=Acerca del editor de scripts
47
Get_help=Ayuda
48
Help=Ayuda
49
_Package_script=Empaquetar script...
50
_Package_help=Empaquetar ayuda...
51

  
52
Problems=Errores
53
Console=Consola
54

  
55
Description=Descripci\u00f3n
56
Resource=Recurso
57
Location=Ubicaci\u00f3n
58

  
59
no_line=sin l\u00ednea
60
Line=L\u00ednea
61

  
62
Welcome_message=Bienvenido al entorno de scripting v1.0
63

  
64
Error_opening_in_edition_mode_title=Error al abrir fichero
65
Error_opening_in_edition_mode=No se pudo abrir el fichero en modo edici\u00f3n
66

  
67
New_Script=Nuevo elemento
68
New_Script_Description=Crea un nuevo script, di\u00e1logo, proyecto, directorio
69

  
70
Deleting_JavaDocs_successfully_title=Operaci\u00f3n realizada con \u00e9xito
71
Deleting_JavaDocs_successfully=Eliminaci\u00f3n de los JavaDocs exitosa
72

  
73
Import_JavaDoc_title=Importar JavaDocs desde los directorios de sistema al ScriptingFramework JavaDoc
74
Remove_JavaDoc_title=Eliminar JavaDocs del ScriptingFramework JavaDoc
75

  
76
projects_modified_title=Proyectos sin guardar
77
projects_modified=Algunos proyectos han sido modificados. \u00bfSalvar cambios?
78
project_modified=ha sido modificado. \u00bfSalvar cambios?
79

  
80
Delete_all_title=Borrar el proyecto
81
Delete_all_message=Esta acci\u00f3n elimina el directorio y todo su contenido. \u00bfContinuar?
82

  
83
About_us_title=Informaci\u00f3n de todos los contribuidores al proyecto gvSIG
84

  
85
Success=Operaci\u00f3n realizada con \u00e9xito
86
Error=Error
87
Error_renaming=Inesperado error renombrando el fichero
88
Error_moving=Inesperado error moviendo el fichero
89
Error_help=Selecciona una ayuda
90
Rename_succesfully=Renombrado del fichero realizado con \u00e9xito
91
Select_first=Debe seleccionar un fichero primero
92
help_key=No hay ayuda para el texto
93

  
94

  
95
script_filesystem=script en el \u00e1rbol de directorio
96
Move=Mover
97
move_desc=Cambia la ubicaci\u00f3n del fichero seleccionado
98
Rename=Renombrar
99
rename_desc=Cambia el nombre y otras propiedades del fichero seleccionado
100
Moving_succesfully=Reubicaci\u00f3n del fichero realizado con \u00e9xito
101

  
102
Dialog=Di\u00e1logo
103
Code=C\u00f3digo
104
Properties=Propiedades
105

  
106
current_name=Nombre actual
107
new_name=Nuevo nombre
108
rename_name_title=Cambiar el nombre del fichero
109
more_properties=M\u00e1s propiedades
110

  
111
Javadoc_remove=Selecciona el JavaDoc a eliminar
112
no_Javadoc_remove=No hay JavaDoc para eliminar
113

  
114
Name=Nombre
115
Version=Versi\u00f3n
116
Created_by=Creado por
117
Description=Descripci\u00f3n
118
Type=Tipo
119
Language=Lenguaje
120
Author=Autor
121
Move_from=Mover desde
122
Move_to=Destino
123
Browse=Abrir
124
Import_from=Importar desde
125

  
126
Name_blank=El nombre no puede estar en blanco
127
Name_exists=Este nombre ya existe. Debe elegir otro
128
Import_success=Los JavaDocs se han importado satisfactoriamente
129
JavaDoc_Error=Selecciona JavaDoc a importar
130

  
131
go_to_line_Xhorizontal_ellipsisX=Ir a la l\u00ednea\u2026
132
find_Xhorizontal_ellipsisX=Buscar\u2026
133
replace_Xhorizontal_ellipsisX=Reemplazar\u2026
134
_Searching_autorun_scripts_Xhorizontal_ellipsisX=Buscando scripts de autoarranque...
135
_Running_autorun_script_from_XnameX=Ejecutando script de autoarranque desde {0}
org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/main/resources-plugin/i18n/text_en.properties
1
_Scripting=Scripting
2
_Show_scripting_launcher=Show scripting launcher
3
_Show_scripting_composer=Show scripting composer
4
_Show_scripting_jython_console=Show Jython console
5

  
6
User=User
7
System=System
8

  
9
Accept=Accept
10
Cancel=Cancel
11
Apply=Apply
12

  
13
Retrieving_data=Retrieving data...
14

  
15
File=File
16
New=New
17
Close_document=Close document
18
Close_all_documents=Close all documents
19
Run=Run
20
Save=Save
21
Close=Close
22
Close_current_tab=Close current tab
23
Open_edition_mode=Open edition mode
24
Run_selected_script=Run selected script
25
Delete_selected_script=Delete selected script
26
Refresh_directories=Refresh directories
27
Move_files=Move files
28
Set_file_preferences=Set file preferences
29
_Running_autorun_script_from_XnameX=Running autorun script from {0}
30

  
31
Edit=Edit
32
Cut=Cut
33
Copy=Copy
34
Paste=Paste
35
Select_all=Select all
36

  
37
Tools=Tools
38
Launcher=Scripting Launcher
39
Scripting_Launcher=Scripting Launcher
40
Scripting_Composer=Scripting Composer
41
Jython_Console=Jython console
42
Import_JavaDoc=Import JavaDoc
43
JavaDoc=Java Help (JavaDoc)
44
Remove_JavaDoc=Remove JavaDoc
45
Help_contents=Help contents
46
About_scripts_editor=About scripts editor
47
Get_help=Help
48
Help=Help
49
_Package_script=Package script...
50
_Package_help=Package help...
51

  
52
Problems=Errors
53
Console=Console
54

  
55
Description=Description
56
Resource=Resource
57
Location=Location
58

  
59
no_line=no line
60
Line=Line
61

  
62
Welcome_message=Welcome to the scripting framework v1.0
63

  
64
Error_opening_in_edition_mode_title=Error opening file in edition mode
65
Error_opening_in_edition_mode=The file can't be opened in edition mode
66

  
67
New_Script=New element
68
New_Script_Description=Create a new script, dialog, project, directory
69

  
70
Deleting_JavaDocs_successfully_title=Operation successful
71
Deleting_JavaDocs_successfully=JavaDocs deleted successfully
72

  
73
Import_JavaDoc_title=Import JavaDocs from filesystem to ScriptingFramework JavaDoc
74
Remove_JavaDoc_title=Remove JavaDocs from ScriptingFramework JavaDoc
75

  
76
projects_modified_title=Projects without saving
77
projects_modified=Some projects have been modified. Save changes?
78
project_modified=has been modified. Save changes?
79

  
80
Delete_all_title=Delete project
81
Delete_all_message=This action removes the directory and all its content. Continue?
82

  
83
About_us_title=Information about all the gvSIG's contributors
84

  
85
Success=Operation done successfully
86
Error=Error
87
Error_renaming=Unexpected error renaming the file
88
Error_moving=Unexpected error moving the file
89
Error_help=Select help
90
Rename_succesfully=Renaming successfully
91
Select_first=Select a file first
92
help_key=There isn't any help for this text
93

  
94

  
95
script_filesystem=script filesystem
96
Move=Move
97
move_desc=Change location of selected file
98
Rename=Rename
99
rename_desc=Change the name and other properties of the selected file
100
Moving_succesfully=File moved successfully
101

  
102
Dialog=Dialog
103
Code=Code
104
Properties=Properties
105

  
106
current_name=Current name
107
new_name=New name
108
rename_name_title=Change filename
109
more_properties=more properties
110

  
111
Javadoc_remove=Choose JavaDoc to delete
112
no_Javadoc_remove=There's no JavaDoc to delete
113

  
114
Name=Name
115
Version=Version
116
Created_by=Created by
117
Description=Description
118
Type=Type
119
Language=Language
120
Author=Author
121
Move_from=Move from
122
Move_to=Move to
123
Browse=Browse
124
Import_from=Import from
125

  
126
Name_blank=Name can't be blank
127
Name_exists=This name already exists. Choose another one
128
Import_success=JavaDocs imported successfully
129
JavaDoc_Error=Select JavaDoc to import
130

  
131

  
132
go_to_line_Xhorizontal_ellipsisX=Go To Line\u2026
133
find_Xhorizontal_ellipsisX=Find\u2026
134
replace_Xhorizontal_ellipsisX=Replace\u2026
135

  
136
_Searching_autorun_scripts_Xhorizontal_ellipsisX=Searching autorun scripts...
137
_Running_autorun_script_from_XnameX=Running autorun script from {0}
org.gvsig.scripting/tags/org.gvsig.scripting-2.3.7/org.gvsig.scripting.app/org.gvsig.scripting.app.mainplugin/src/main/resources-plugin/scripting/lib/BeautifulSoup.py
1
"""Beautiful Soup
2
Elixir and Tonic
3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
5

  
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
9

  
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
15

  
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
19

  
20
* chardet, for auto-detecting character encodings
21
  http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23
  by stock Python.
24
  http://cjkpython.i18n.org/
25

  
26
Beautiful Soup defines classes for two main parsing strategies:
27

  
28
 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
   language that kind of looks like XML.
30

  
31
 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
   or invalid. This class has web browser-like heuristics for
33
   obtaining a sensible parse tree in the face of common HTML errors.
34

  
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38

  
39
For more than you ever wanted to know about Beautiful Soup, see the
40
documentation:
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
42

  
43
Here, have some legalese:
44

  
45
Copyright (c) 2004-2010, Leonard Richardson
46

  
47
All rights reserved.
48

  
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
51
met:
52

  
53
  * Redistributions of source code must retain the above copyright
54
    notice, this list of conditions and the following disclaimer.
55

  
56
  * Redistributions in binary form must reproduce the above
57
    copyright notice, this list of conditions and the following
58
    disclaimer in the documentation and/or other materials provided
59
    with the distribution.
60

  
61
  * Neither the name of the the Beautiful Soup Consortium and All
62
    Night Kosher Bakery nor the names of its contributors may be
63
    used to endorse or promote products derived from this software
64
    without specific prior written permission.
65

  
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77

  
78
"""
79
from __future__ import generators
80

  
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
82
__version__ = "3.2.1"
83
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
84
__license__ = "New-style BSD"
85

  
86
from sgmllib import SGMLParser, SGMLParseError
87
import codecs
88
import markupbase
89
import types
90
import re
91
import sgmllib
92
try:
93
  from htmlentitydefs import name2codepoint
94
except ImportError:
95
  name2codepoint = {}
96
try:
97
    set
98
except NameError:
99
    from sets import Set as set
100

  
101
#These hacks make Beautiful Soup able to parse XML with namespaces
102
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
103
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
104

  
105
DEFAULT_OUTPUT_ENCODING = "utf-8"
106

  
107
def _match_css_class(str):
108
    """Build a RE to match the given CSS class."""
109
    return re.compile(r"(^|.*\s)%s($|\s)" % str)
110

  
111
# First, the classes that represent markup elements.
112

  
113
class PageElement(object):
114
    """Contains the navigational information for some part of the page
115
    (either a tag or a piece of text)"""
116

  
117
    def _invert(h):
118
        "Cheap function to invert a hash."
119
        i = {}
120
        for k,v in h.items():
121
            i[v] = k
122
        return i
123

  
124
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
125
                                      "quot" : '"',
126
                                      "amp" : "&",
127
                                      "lt" : "<",
128
                                      "gt" : ">" }
129

  
130
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
131

  
132
    def setup(self, parent=None, previous=None):
133
        """Sets up the initial relations between this element and
134
        other elements."""
135
        self.parent = parent
136
        self.previous = previous
137
        self.next = None
138
        self.previousSibling = None
139
        self.nextSibling = None
140
        if self.parent and self.parent.contents:
141
            self.previousSibling = self.parent.contents[-1]
142
            self.previousSibling.nextSibling = self
143

  
144
    def replaceWith(self, replaceWith):
145
        oldParent = self.parent
146
        myIndex = self.parent.index(self)
147
        if hasattr(replaceWith, "parent")\
148
                  and replaceWith.parent is self.parent:
149
            # We're replacing this element with one of its siblings.
150
            index = replaceWith.parent.index(replaceWith)
151
            if index and index < myIndex:
152
                # Furthermore, it comes before this element. That
153
                # means that when we extract it, the index of this
154
                # element will change.
155
                myIndex = myIndex - 1
156
        self.extract()
157
        oldParent.insert(myIndex, replaceWith)
158

  
159
    def replaceWithChildren(self):
160
        myParent = self.parent
161
        myIndex = self.parent.index(self)
162
        self.extract()
163
        reversedChildren = list(self.contents)
164
        reversedChildren.reverse()
165
        for child in reversedChildren:
166
            myParent.insert(myIndex, child)
167

  
168
    def extract(self):
169
        """Destructively rips this element out of the tree."""
170
        if self.parent:
171
            try:
172
                del self.parent.contents[self.parent.index(self)]
173
            except ValueError:
174
                pass
175

  
176
        #Find the two elements that would be next to each other if
177
        #this element (and any children) hadn't been parsed. Connect
178
        #the two.
179
        lastChild = self._lastRecursiveChild()
180
        nextElement = lastChild.next
181

  
182
        if self.previous:
183
            self.previous.next = nextElement
184
        if nextElement:
185
            nextElement.previous = self.previous
186
        self.previous = None
187
        lastChild.next = None
188

  
189
        self.parent = None
190
        if self.previousSibling:
191
            self.previousSibling.nextSibling = self.nextSibling
192
        if self.nextSibling:
193
            self.nextSibling.previousSibling = self.previousSibling
194
        self.previousSibling = self.nextSibling = None
195
        return self
196

  
197
    def _lastRecursiveChild(self):
198
        "Finds the last element beneath this object to be parsed."
199
        lastChild = self
200
        while hasattr(lastChild, 'contents') and lastChild.contents:
201
            lastChild = lastChild.contents[-1]
202
        return lastChild
203

  
204
    def insert(self, position, newChild):
205
        if isinstance(newChild, basestring) \
206
            and not isinstance(newChild, NavigableString):
207
            newChild = NavigableString(newChild)
208

  
209
        position =  min(position, len(self.contents))
210
        if hasattr(newChild, 'parent') and newChild.parent is not None:
211
            # We're 'inserting' an element that's already one
212
            # of this object's children.
213
            if newChild.parent is self:
214
                index = self.index(newChild)
215
                if index > position:
216
                    # Furthermore we're moving it further down the
217
                    # list of this object's children. That means that
218
                    # when we extract this element, our target index
219
                    # will jump down one.
220
                    position = position - 1
221
            newChild.extract()
222

  
223
        newChild.parent = self
224
        previousChild = None
225
        if position == 0:
226
            newChild.previousSibling = None
227
            newChild.previous = self
228
        else:
229
            previousChild = self.contents[position-1]
230
            newChild.previousSibling = previousChild
231
            newChild.previousSibling.nextSibling = newChild
232
            newChild.previous = previousChild._lastRecursiveChild()
233
        if newChild.previous:
234
            newChild.previous.next = newChild
235

  
236
        newChildsLastElement = newChild._lastRecursiveChild()
237

  
238
        if position >= len(self.contents):
239
            newChild.nextSibling = None
240

  
241
            parent = self
242
            parentsNextSibling = None
243
            while not parentsNextSibling:
244
                parentsNextSibling = parent.nextSibling
245
                parent = parent.parent
246
                if not parent: # This is the last element in the document.
247
                    break
248
            if parentsNextSibling:
249
                newChildsLastElement.next = parentsNextSibling
250
            else:
251
                newChildsLastElement.next = None
252
        else:
253
            nextChild = self.contents[position]
254
            newChild.nextSibling = nextChild
255
            if newChild.nextSibling:
256
                newChild.nextSibling.previousSibling = newChild
257
            newChildsLastElement.next = nextChild
258

  
259
        if newChildsLastElement.next:
260
            newChildsLastElement.next.previous = newChildsLastElement
261
        self.contents.insert(position, newChild)
262

  
263
    def append(self, tag):
264
        """Appends the given tag to the contents of this tag."""
265
        self.insert(len(self.contents), tag)
266

  
267
    def findNext(self, name=None, attrs={}, text=None, **kwargs):
268
        """Returns the first item that matches the given criteria and
269
        appears after this Tag in the document."""
270
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
271

  
272
    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
273
                    **kwargs):
274
        """Returns all items that match the given criteria and appear
275
        after this Tag in the document."""
276
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
277
                             **kwargs)
278

  
279
    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
280
        """Returns the closest sibling to this Tag that matches the
281
        given criteria and appears after this Tag in the document."""
282
        return self._findOne(self.findNextSiblings, name, attrs, text,
283
                             **kwargs)
284

  
285
    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
286
                         **kwargs):
287
        """Returns the siblings of this Tag that match the given
288
        criteria and appear after this Tag in the document."""
289
        return self._findAll(name, attrs, text, limit,
290
                             self.nextSiblingGenerator, **kwargs)
291
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
292

  
293
    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
294
        """Returns the first item that matches the given criteria and
295
        appears before this Tag in the document."""
296
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
297

  
298
    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
299
                        **kwargs):
300
        """Returns all items that match the given criteria and appear
301
        before this Tag in the document."""
302
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
303
                           **kwargs)
304
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
305

  
306
    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
307
        """Returns the closest sibling to this Tag that matches the
308
        given criteria and appears before this Tag in the document."""
309
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
310
                             **kwargs)
311

  
312
    def findPreviousSiblings(self, name=None, attrs={}, text=None,
313
                             limit=None, **kwargs):
314
        """Returns the siblings of this Tag that match the given
315
        criteria and appear before this Tag in the document."""
316
        return self._findAll(name, attrs, text, limit,
317
                             self.previousSiblingGenerator, **kwargs)
318
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
319

  
320
    def findParent(self, name=None, attrs={}, **kwargs):
321
        """Returns the closest parent of this Tag that matches the given
322
        criteria."""
323
        # NOTE: We can't use _findOne because findParents takes a different
324
        # set of arguments.
325
        r = None
326
        l = self.findParents(name, attrs, 1)
327
        if l:
328
            r = l[0]
329
        return r
330

  
331
    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
332
        """Returns the parents of this Tag that match the given
333
        criteria."""
334

  
335
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
336
                             **kwargs)
337
    fetchParents = findParents # Compatibility with pre-3.x
338

  
339
    #These methods do the real heavy lifting.
340

  
341
    def _findOne(self, method, name, attrs, text, **kwargs):
342
        r = None
343
        l = method(name, attrs, text, 1, **kwargs)
344
        if l:
345
            r = l[0]
346
        return r
347

  
348
    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # (Possibly) special case some findAll*(...) searches: with
            # nothing but a name to go on, a plain scan avoids building
            # a SoupStrainer.
            nameOnly = (text is None and not limit and not attrs
                        and not kwargs)
            if nameOnly and name is True:
                # findAll*(True)
                return [element for element in generator()
                        if isinstance(element, Tag)]
            if nameOnly and isinstance(name, basestring):
                # findAll*('tag-name')
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        results = ResultSet(strainer)
        for candidate in generator():
            if not candidate:
                continue
            found = strainer.search(candidate)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results
383

  
384
    #These Generators can be used to navigate starting from both
385
    #NavigableStrings and Tags.
386
    def nextGenerator(self):
387
        i = self
388
        while i is not None:
389
            i = i.next
390
            yield i
391

  
392
    def nextSiblingGenerator(self):
393
        i = self
394
        while i is not None:
395
            i = i.nextSibling
396
            yield i
397

  
398
    def previousGenerator(self):
399
        i = self
400
        while i is not None:
401
            i = i.previous
402
            yield i
403

  
404
    def previousSiblingGenerator(self):
405
        i = self
406
        while i is not None:
407
            i = i.previousSibling
408
            yield i
409

  
410
    def parentGenerator(self):
411
        i = self
412
        while i is not None:
413
            i = i.parent
414
            yield i
415

  
416
    # Utility methods
417
    def substituteEncoding(self, str, encoding=None):
        """Returns str with every "%SOUP-ENCODING%" placeholder replaced
        by the encoding name, using "utf-8" when none is given."""
        if not encoding:
            encoding = "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)
420

  
421
    def toEncoding(self, s, encoding=None):
422
        """Encodes an object to a string in some encoding, or to Unicode.
423
        ."""
424
        if isinstance(s, unicode):
425
            if encoding:
426
                s = s.encode(encoding)
427
        elif isinstance(s, str):
428
            if encoding:
429
                s = s.encode(encoding)
430
            else:
431
                s = unicode(s)
432
        else:
433
            if encoding:
434
                s  = self.toEncoding(str(s), encoding)
435
            else:
436
                s = unicode(s)
437
        return s
438

  
439
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
440
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
441
                                           + ")")
442

  
443
    def _sub_entity(self, x):
444
        """Used with a regular expression to substitute the
445
        appropriate XML entity for an XML special character."""
446
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
447

  
448

  
449
class NavigableString(unicode, PageElement):
    """A unicode string that also inherits the PageElement behaviour."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Return the argument tuple for __new__: the entity-escaped text
        encoded in DEFAULT_OUTPUT_ENCODING."""
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        # Call-form raise: same exception and message as the old
        # "raise Class, message" statement, without the legacy syntax.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Return the default-encoded rendering decoded back to unicode."""
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the text with bare "<", ">" and "&" replaced by entities,
        encoded in `encoding` (left unencoded when `encoding` is false)."""
        # Substitute outgoing XML entities.
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
        if encoding:
            return data.encode(encoding)
        else:
            return data
485

  
486
class CData(NavigableString):
    """A NavigableString that is rendered inside a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the NavigableString rendering wrapped in a CDATA
        section."""
        body = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % body
490

  
491
class ProcessingInstruction(NavigableString):
    """A NavigableString that is rendered between "<?" and "?>"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the instruction wrapped in "<?" and "?>", with any
        "%SOUP-ENCODING%" placeholder replaced by the output encoding."""
        text = self
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        encoded = self.toEncoding(text, encoding)
        return "<?%s?>" % encoded
497

  
498
class Comment(NavigableString):
    """A NavigableString that is rendered as a markup comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the NavigableString rendering wrapped in comment
        delimiters."""
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
501

  
502
class Declaration(NavigableString):
    """A NavigableString that is rendered between "<!" and ">"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the NavigableString rendering wrapped in "<!" and ">"."""
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
505

  
506
class Tag(PageElement):
507

  
508
    """Represents a found HTML tag with its attributes and contents."""
509

  
510
    def _convertEntities(self, match):
511
        """Used in a call to re.sub to replace HTML, XML, and numeric
512
        entities with the appropriate Unicode characters. If HTML
513
        entities are being converted, any unrecognized entities are
514
        escaped."""
515
        x = match.group(1)
516
        if self.convertHTMLEntities and x in name2codepoint:
517
            return unichr(name2codepoint[x])
518
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
519
            if self.convertXMLEntities:
520
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
521
            else:
522
                return u'&%s;' % x
523
        elif len(x) > 0 and x[0] == '#':
524
            # Handle numeric entities
525
            if len(x) > 1 and x[1] == 'x':
526
                return unichr(int(x[2:], 16))
527
            else:
528
                return unichr(int(x[1:]))
529

  
530
        elif self.escapeUnrecognizedEntities:
531
            return u'&amp;%s;' % x
532
        else:
533
            return u'&%s;' % x
534

  
535
    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        parser   -- only its class, its isSelfClosingTag(name) answer and
                    its three entity-conversion flags are kept.
        name     -- the tag name.
        attrs    -- None, a dict, or a list of (key, value) pairs.
        parent, previous -- passed on to self.setup().
        """

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # A list comprehension replaces the tuple-parameter lambda + map();
        # the result is still a plain list of (key, value) pairs.
        self.attrs = [(k, re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                 self._convertEntities,
                                 val))
                      for (k, val) in self.attrs]
563

  
564
    def getString(self):
565
        if (len(self.contents) == 1
566
            and isinstance(self.contents[0], NavigableString)):
567
            return self.contents[0]
568

  
569
    def setString(self, string):
570
        """Replace the contents of the tag with a string"""
571
        self.clear()
572
        self.append(string)
573

  
574
    string = property(getString, setString)
575

  
576
    def getText(self, separator=u""):
577
        if not len(self.contents):
578
            return u""
579
        stopNode = self._lastRecursiveChild().next
580
        strings = []
581
        current = self.contents[0]
582
        while current is not stopNode:
583
            if isinstance(current, NavigableString):
584
                strings.append(current.strip())
585
            current = current.next
586
        return separator.join(strings)
587

  
588
    text = property(getText)
589

  
590
    def get(self, key, default=None):
591
        """Returns the value of the 'key' attribute for the tag, or
592
        the value given for 'default' if it doesn't have that
593
        attribute."""
594
        return self._getAttrMap().get(key, default)
595

  
596
    def clear(self):
597
        """Extract all children."""
598
        for child in self.contents[:]:
599
            child.extract()
600

  
601
    def index(self, element):
602
        for i, child in enumerate(self.contents):
603
            if child is element:
604
                return i
605
        raise ValueError("Tag.index: element not in tag")
606

  
607
    def has_key(self, key):
608
        return self._getAttrMap().has_key(key)
609

  
610
    def __getitem__(self, key):
611
        """tag[key] returns the value of the 'key' attribute for the tag,
612
        and throws an exception if it's not there."""
613
        return self._getAttrMap()[key]
614

  
615
    def __iter__(self):
616
        "Iterating over a tag iterates over its contents."
617
        return iter(self.contents)
618

  
619
    def __len__(self):
620
        "The length of a tag is the length of its list of contents."
621
        return len(self.contents)
622

  
623
    def __contains__(self, x):
624
        return x in self.contents
625

  
626
    def __nonzero__(self):
627
        "A tag is non-None even if it has no contents."
628
        return True
629

  
630
    def __setitem__(self, key, value):
631
        """Setting tag[key] sets the value of the 'key' attribute for the
632
        tag."""
633
        self._getAttrMap()
634
        self.attrMap[key] = value
635
        found = False
636
        for i in range(0, len(self.attrs)):
637
            if self.attrs[i][0] == key:
638
                self.attrs[i] = (key, value)
639
                found = True
640
        if not found:
641
            self.attrs.append((key, value))
642
        self._getAttrMap()[key] = value
643

  
644
    def __delitem__(self, key):
645
        "Deleting tag[key] deletes all 'key' attributes for the tag."
646
        for item in self.attrs:
647
            if item[0] == key:
648
                self.attrs.remove(item)
649
                #We don't break because bad HTML can define the same
650
                #attribute multiple times.
651
            self._getAttrMap()
652
            if self.attrMap.has_key(key):
653
                del self.attrMap[key]
654

  
655
    def __call__(self, *args, **kwargs):
656
        """Calling a tag like a function is the same as calling its
657
        findAll() method. Eg. tag('a') returns a list of all the A tags
658
        found within this tag."""
659
        return apply(self.findAll, args, kwargs)
660

  
661
    def __getattr__(self, tag):
662
        #print "Getattr %s.%s" % (self.__class__, tag)
663
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
664
            return self.find(tag[:-3])
665
        elif tag.find('__') != 0:
666
            return self.find(tag)
667
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
668

  
669
    def __eq__(self, other):
670
        """Returns true iff this tag has the same name, the same attributes,
671
        and the same contents (recursively) as the given tag.
672

  
673
        NOTE: right now this will return false if two tags have the
674
        same attributes in a different order. Should this be fixed?"""
675
        if other is self:
676
            return True
677
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
678
            return False
679
        for i in range(0, len(self.contents)):
680
            if self.contents[i] != other.contents[i]:
681
                return False
682
        return True
683

  
684
    def __ne__(self, other):
685
        """Returns true iff this tag is not identical to the other tag,
686
        as defined in __eq__."""
687
        return not self == other
688

  
689
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string, exactly as __str__ does for the
        given encoding."""
        rendered = self.__str__(encoding)
        return rendered
692

  
693
    def __unicode__(self):
694
        return self.__str__(None)
695

  
696
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint set, the opening tag, the contents and the
        closing tag are put on separate lines and indented according to
        indentLevel.  A hidden tag renders as its contents only.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # "&squot;" is not a defined entity in XML or
                            # HTML, so the numeric character reference
                            # for the apostrophe is used instead.
                            val = val.replace("'", "&#39;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            # `space` is only defined, and only used, when pretty-printing.
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
780

  
781
    def decompose(self):
782
        """Recursively destroys the contents of this tree."""
783
        self.extract()
784
        if len(self.contents) == 0:
785
            return
786
        current = self.contents[0]
787
        while current is not None:
788
            next = current.next
789
            if isinstance(current, Tag):
790
                del current.contents[:]
791
            current.parent = None
792
            current.previous = None
793
            current.previousSibling = None
794
            current.next = None
795
            current.nextSibling = None
796
            current = next
797

  
798
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Render the tag through __str__ with pretty-printing switched
        on, in the given encoding."""
        prettyPrint = True
        return self.__str__(encoding, prettyPrint)
800

  
801
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        When pretty-printing, each text child is stripped, indented and
        put on its own line; text that is empty after stripping is left
        out."""
        pieces = []
        for child in self:
            if isinstance(child, NavigableString):
                text = child.__str__(encoding)
                if text and prettyPrint:
                    text = text.strip()
                if not text:
                    continue
                if prettyPrint:
                    pieces.append(" " * (indentLevel-1))
                    pieces.append(text)
                    pieces.append("\n")
                else:
                    pieces.append(text)
            elif isinstance(child, Tag):
                pieces.append(child.__str__(encoding, prettyPrint, indentLevel))
        return ''.join(pieces)
821

  
822
    #Soup methods
823

  
824
    def find(self, name=None, attrs={}, recursive=True, text=None,
825
             **kwargs):
826
        """Return only the first child of this Tag matching the given
827
        criteria."""
828
        r = None
829
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
830
        if l:
831
            r = l[0]
832
        return r
833
    findChild = find
834

  
835
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
836
                limit=None, **kwargs):
837
        """Extracts a list of Tag objects that match the given
838
        criteria.  You can specify the name of the Tag and any
839
        attributes you want the Tag to have.
840

  
841
        The value of a key-value pair in the 'attrs' map can be a
842
        string, a list of strings, a regular expression object, or a
843
        callable that takes a string and returns whether or not the
844
        string matches for some custom definition of 'matches'. The
845
        same is true of the tag name."""
846
        generator = self.recursiveChildGenerator
847
        if not recursive:
848
            generator = self.childGenerator
849
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
850
    findChildren = findAll
851

  
852
    # Pre-3.x compatibility methods
853
    first = find
854
    fetch = findAll
855

  
856
    def fetchText(self, text=None, recursive=True, limit=None):
857
        return self.findAll(text=text, recursive=recursive, limit=limit)
858

  
859
    def firstText(self, text=None, recursive=True):
860
        return self.find(text=text, recursive=recursive)
861

  
862
    #Private methods
863

  
864
    def _getAttrMap(self):
865
        """Initializes a map representation of this tag's attributes,
866
        if not already initialized."""
867
        if not getattr(self, 'attrMap'):
868
            self.attrMap = {}
869
            for (key, value) in self.attrs:
870
                self.attrMap[key] = value
871
        return self.attrMap
872

  
873
    #Generator methods
874
    def childGenerator(self):
875
        # Just use the iterator from the contents
876
        return iter(self.contents)
877

  
878
    def recursiveChildGenerator(self):
879
        if not len(self.contents):
880
            raise StopIteration
881
        stopNode = self._lastRecursiveChild().next
882
        current = self.contents[0]
883
        while current is not stopNode:
884
            yield current
885
            current = current.next
886

  
887

  
888
# Next, a couple classes to represent queries and their results.
889
class SoupStrainer:
890
    """Encapsulates a number of ways of matching a markup element (tag or
891
    text)."""
892

  
893
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
894
        self.name = name
895
        if isinstance(attrs, basestring):
896
            kwargs['class'] = _match_css_class(attrs)
897
            attrs = None
898
        if kwargs:
899
            if attrs:
900
                attrs = attrs.copy()
901
                attrs.update(kwargs)
902
            else:
903
                attrs = kwargs
904
        self.attrs = attrs
905
        self.text = text
906

  
907
    def __str__(self):
908
        if self.text:
909
            return self.text
910
        else:
911
            return "%s|%s" % (self.name, self.attrs)
912

  
913
    def searchTag(self, markupName=None, markupAttrs={}):
914
        found = None
915
        markup = None
916
        if isinstance(markupName, Tag):
917
            markup = markupName
918
            markupAttrs = markup
919
        callFunctionWithTagData = callable(self.name) \
920
                                and not isinstance(markupName, Tag)
921

  
922
        if (not self.name) \
923
               or callFunctionWithTagData \
924
               or (markup and self._matches(markup, self.name)) \
925
               or (not markup and self._matches(markupName, self.name)):
926
            if callFunctionWithTagData:
927
                match = self.name(markupName, markupAttrs)
928
            else:
929
                match = True
930
                markupAttrMap = None
931
                for attr, matchAgainst in self.attrs.items():
932
                    if not markupAttrMap:
933
                         if hasattr(markupAttrs, 'get'):
934
                            markupAttrMap = markupAttrs
935
                         else:
936
                            markupAttrMap = {}
937
                            for k,v in markupAttrs:
938
                                markupAttrMap[k] = v
939
                    attrValue = markupAttrMap.get(attr)
940
                    if not self._matches(attrValue, matchAgainst):
941
                        match = False
942
                        break
943
            if match:
944
                if markup:
945
                    found = markup
946
                else:
947
                    found = markupName
948
        return found
949

  
950
    def search(self, markup):
        """Check `markup` against the criteria and return what matched,
        or None.

        An iterable that is not a Tag is scanned for its first matching
        NavigableString.  A Tag is checked with searchTag(), unless a text
        criterion is set.  A string is matched against the text
        criterion.  Anything else raises Exception."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call-form raise: same exception and message as the old
            # "raise Class, message" statement.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found
976

  
977
    def _matches(self, markup, matchAgainst):
        """Test `markup` (a Tag, a string, an attribute value or None)
        against one criterion.

        matchAgainst may be True (anything but None matches), a callable
        (its return value is the result), an object with a match attribute
        such as a compiled regular expression, a list-like object, or a
        plain value compared for equality.  The result may be any
        truthy/falsy value, not only True or False."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): calls has_key() on markup, which at this
                # point is a string or None -- confirm this branch is ever
                # reached and what it is meant to test.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Bring the criterion to the same string type as markup;
                # the comparison itself happens in the fallback below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            # Fallback for every case above that produced a false result:
            # plain equality between criterion and markup.
            if not result:
                result = matchAgainst == markup
        return result
1008

  
1009
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Create an empty result list and remember `source`."""
        # Initialise this instance; the previous code called
        # list.__init__ on a throwaway list instead of on self.
        list.__init__(self)
        self.source = source
1015

  
1016
# Now, some helper functions.
1017

  
1018
def buildTagMap(default, *args):
    """Merge any number of maps, lists and scalars into a single map.

    A map contributes its own key/value pairs; every item of a list, and
    every scalar, becomes a key mapped to `default`.  Later arguments
    override earlier ones.  Used to build the SELF_CLOSING_TAGS,
    NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and partial
    maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: take its pairs as they are.
            built.update(portion.items())
        elif hasattr(portion, '__iter__'):
            # A list: every item maps to the default.
            built.update(dict.fromkeys(portion, default))
        else:
            # A scalar: it maps to the default.
            built[portion] = default
    return built
1036

  
1037
# Now, the parser classes.
1038

  
1039
class BeautifulStoneSoup(Tag, SGMLParser):
1040

  
1041
    """This class contains the basic parser and search code. It defines
1042
    a parser that knows nothing about tag behavior except for the
1043
    following:
1044

  
1045
      You can't close a tag without closing all the tags it encloses.
1046
      That is, "<foo><bar></foo>" actually means
1047
      "<foo><bar></bar></foo>".
1048

  
1049
    [Another possible explanation is "<foo><bar /></foo>", but since
1050
    this class defines no SELF_CLOSING_TAGS, it will never use that
1051
    explanation.]
1052

  
1053
    This class is useful for parsing XML or made-up markup languages,
1054
    or when BeautifulSoup makes an assumption counter to what you were
1055
    expecting."""
1056

  
1057
    SELF_CLOSING_TAGS = {}
1058
    NESTABLE_TAGS = {}
1059
    RESET_NESTING_TAGS = {}
1060
    QUOTE_TAGS = {}
1061
    PRESERVE_WHITESPACE_TAGS = []
1062

  
1063
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1064
                       lambda x: x.group(1) + ' />'),
1065
                      (re.compile('<!\s+([^<>]*)>'),
1066
                       lambda x: '<!' + x.group(1) + '>')
1067
                      ]
1068

  
1069
    ROOT_TAG_NAME = u'[document]'
1070

  
1071
    HTML_ENTITIES = "html"
1072
    XML_ENTITIES = "xml"
1073
    XHTML_ENTITIES = "xhtml"
1074
    # TODO: This only exists for backwards-compatibility
1075
    ALL_ENTITIES = XHTML_ENTITIES
1076

  
1077
    # Used when determining whether a text node is all whitespace and
1078
    # can be replaced with a single space. A text node that contains
1079
    # fancy Unicode spaces (usually non-breaking) should be left
1080
    # alone.
1081
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1082

  
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff